Merge pull request #16 from seanpedrick-case/dev
Browse filesAdded id and text properties to annotation object. Other minor changes.
- app.py +34 -17
- requirements.txt +1 -2
- tools/auth.py +3 -1
- tools/aws_functions.py +0 -4
- tools/file_conversion.py +375 -183
- tools/file_redaction.py +35 -13
- tools/redaction_review.py +112 -34
- tools/textract_batch_call.py +10 -6
app.py
CHANGED
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
|
|
15 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
16 |
from tools.custom_csvlogger import CSVLogger_custom
|
17 |
from tools.find_duplicate_pages import identify_similar_pages
|
18 |
-
from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
|
19 |
|
20 |
# Suppress downcasting warnings
|
21 |
pd.set_option('future.no_silent_downcasting', True)
|
@@ -153,6 +153,8 @@ with app:
|
|
153 |
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
154 |
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
|
155 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
|
|
|
|
156 |
|
157 |
load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
|
158 |
s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
|
@@ -164,7 +166,7 @@ with app:
|
|
164 |
default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
|
165 |
|
166 |
# Base tables that are not modified subsequent to load
|
167 |
-
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=
|
168 |
all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
|
169 |
all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
|
170 |
cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
|
@@ -203,6 +205,7 @@ with app:
|
|
203 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
204 |
|
205 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
|
|
206 |
|
207 |
###
|
208 |
# UI DESIGN
|
@@ -263,8 +266,10 @@ with app:
|
|
263 |
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
|
264 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
265 |
with gr.Row():
|
266 |
-
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
267 |
-
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
|
|
|
|
268 |
|
269 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
270 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
@@ -298,8 +303,8 @@ with app:
|
|
298 |
with gr.Column(scale=2):
|
299 |
with gr.Row(equal_height=True):
|
300 |
annotation_last_page_button = gr.Button("Previous page", scale = 4)
|
301 |
-
annotate_current_page = gr.Number(value=
|
302 |
-
annotate_max_pages = gr.Number(value=
|
303 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
304 |
|
305 |
zoom_str = str(annotator_zoom_number) + '%'
|
@@ -336,7 +341,7 @@ with app:
|
|
336 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
337 |
page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
|
338 |
text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
|
339 |
-
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(
|
340 |
|
341 |
with gr.Row(equal_height=True):
|
342 |
exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
|
@@ -346,7 +351,9 @@ with app:
|
|
346 |
|
347 |
undo_last_removal_btn = gr.Button(value="Undo last element removal")
|
348 |
|
349 |
-
selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=
|
|
|
|
|
350 |
|
351 |
with gr.Accordion("Search all extracted text", open=True):
|
352 |
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
|
@@ -520,6 +527,13 @@ with app:
|
|
520 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
521 |
|
522 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
523 |
|
524 |
###
|
525 |
# REVIEW PDF REDACTIONS
|
@@ -546,17 +560,22 @@ with app:
|
|
546 |
|
547 |
# Apply page redactions
|
548 |
annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
|
|
|
|
|
|
|
|
|
|
|
549 |
|
550 |
# Review table controls
|
551 |
recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
|
552 |
page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
|
553 |
text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
|
554 |
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
561 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
562 |
|
@@ -577,9 +596,7 @@ with app:
|
|
577 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
578 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
579 |
|
580 |
-
|
581 |
-
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
582 |
-
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
583 |
|
584 |
# Review OCR text buttom
|
585 |
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
|
@@ -717,7 +734,7 @@ if __name__ == "__main__":
|
|
717 |
if RUN_DIRECT_MODE == "0":
|
718 |
|
719 |
if os.environ['COGNITO_AUTH'] == "1":
|
720 |
-
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
721 |
else:
|
722 |
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
723 |
|
|
|
15 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
16 |
from tools.custom_csvlogger import CSVLogger_custom
|
17 |
from tools.find_duplicate_pages import identify_similar_pages
|
18 |
+
from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist
|
19 |
|
20 |
# Suppress downcasting warnings
|
21 |
pd.set_option('future.no_silent_downcasting', True)
|
|
|
153 |
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
154 |
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
|
155 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
156 |
+
no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
|
157 |
+
textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
|
158 |
|
159 |
load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
|
160 |
s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
|
|
|
166 |
default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
|
167 |
|
168 |
# Base tables that are not modified subsequent to load
|
169 |
+
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
|
170 |
all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
|
171 |
all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
|
172 |
cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
|
|
|
205 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
206 |
|
207 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
208 |
+
convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
|
209 |
|
210 |
###
|
211 |
# UI DESIGN
|
|
|
266 |
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
|
267 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
268 |
with gr.Row():
|
269 |
+
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
270 |
+
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
271 |
+
|
272 |
+
convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=True)
|
273 |
|
274 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
275 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
|
|
303 |
with gr.Column(scale=2):
|
304 |
with gr.Row(equal_height=True):
|
305 |
annotation_last_page_button = gr.Button("Previous page", scale = 4)
|
306 |
+
annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
|
307 |
+
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
308 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
309 |
|
310 |
zoom_str = str(annotator_zoom_number) + '%'
|
|
|
341 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
342 |
page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
|
343 |
text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
|
344 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=(4,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
|
345 |
|
346 |
with gr.Row(equal_height=True):
|
347 |
exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
|
|
|
351 |
|
352 |
undo_last_removal_btn = gr.Button(value="Undo last element removal")
|
353 |
|
354 |
+
selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True)
|
355 |
+
selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)
|
356 |
+
selected_entity_colour = gr.Textbox(value="", label="selected_entity_colour", visible=False)
|
357 |
|
358 |
with gr.Accordion("Search all extracted text", open=True):
|
359 |
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
|
|
|
527 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
528 |
|
529 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
530 |
+
|
531 |
+
|
532 |
+
convert_textract_outputs_to_ocr_results.click(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
533 |
+
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
534 |
+
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
|
535 |
+
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
536 |
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path])
|
537 |
|
538 |
###
|
539 |
# REVIEW PDF REDACTIONS
|
|
|
560 |
|
561 |
# Apply page redactions
|
562 |
annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
|
563 |
+
|
564 |
+
# Save current page redactions
|
565 |
+
update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
566 |
+
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
567 |
+
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
568 |
|
569 |
# Review table controls
|
570 |
recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
|
571 |
page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
|
572 |
text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
|
573 |
|
574 |
+
# Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
|
575 |
+
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page, selected_entity_dataframe_row]).\
|
576 |
+
success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour, page_sizes], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
|
577 |
+
success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, annotate_current_page, annotate_previous_page, all_image_annotations_state, annotator], outputs=[annotator, all_image_annotations_state])
|
578 |
+
|
579 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
580 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
581 |
|
|
|
596 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
597 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
598 |
|
599 |
+
|
|
|
|
|
600 |
|
601 |
# Review OCR text buttom
|
602 |
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
|
|
|
734 |
if RUN_DIRECT_MODE == "0":
|
735 |
|
736 |
if os.environ['COGNITO_AUTH'] == "1":
|
737 |
+
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
738 |
else:
|
739 |
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
740 |
|
requirements.txt
CHANGED
@@ -10,7 +10,6 @@ pandas==2.2.3
|
|
10 |
scikit-learn==1.6.1
|
11 |
spacy==3.8.4
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
-
#en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
gradio==5.25.2
|
15 |
boto3==1.37.29
|
16 |
pyarrow==19.0.1
|
@@ -19,7 +18,7 @@ Faker==36.1.1
|
|
19 |
python-levenshtein==0.26.1
|
20 |
spaczz==0.6.1
|
21 |
# The following version
|
22 |
-
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.
|
23 |
rapidfuzz==3.12.1
|
24 |
python-dotenv==1.0.1
|
25 |
numpy==1.26.4
|
|
|
10 |
scikit-learn==1.6.1
|
11 |
spacy==3.8.4
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
|
|
13 |
gradio==5.25.2
|
14 |
boto3==1.37.29
|
15 |
pyarrow==19.0.1
|
|
|
18 |
python-levenshtein==0.26.1
|
19 |
spaczz==0.6.1
|
20 |
# The following version
|
21 |
+
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.2/gradio_image_annotation-0.3.2-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
rapidfuzz==3.12.1
|
23 |
python-dotenv==1.0.1
|
24 |
numpy==1.26.4
|
tools/auth.py
CHANGED
@@ -69,5 +69,7 @@ def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL
|
|
69 |
except client.exceptions.UserNotFoundException:
|
70 |
return False
|
71 |
except Exception as e:
|
72 |
-
|
|
|
|
|
73 |
return False
|
|
|
69 |
except client.exceptions.UserNotFoundException:
|
70 |
return False
|
71 |
except Exception as e:
|
72 |
+
out_message = f"An error occurred: {e}"
|
73 |
+
print(out_message)
|
74 |
+
raise Exception(out_message)
|
75 |
return False
|
tools/aws_functions.py
CHANGED
@@ -42,10 +42,6 @@ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str
|
|
42 |
if RUN_AWS_FUNCTIONS == "1":
|
43 |
|
44 |
try:
|
45 |
-
print("bucket_name:", bucket_name)
|
46 |
-
print("key:", key)
|
47 |
-
print("local_file_path_and_name:", local_file_path_and_name)
|
48 |
-
|
49 |
# Ensure the local directory exists
|
50 |
os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
|
51 |
|
|
|
42 |
if RUN_AWS_FUNCTIONS == "1":
|
43 |
|
44 |
try:
|
|
|
|
|
|
|
|
|
45 |
# Ensure the local directory exists
|
46 |
os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
|
47 |
|
tools/file_conversion.py
CHANGED
@@ -19,6 +19,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
19 |
from pdf2image import convert_from_path
|
20 |
from PIL import Image
|
21 |
from scipy.spatial import cKDTree
|
|
|
|
|
22 |
|
23 |
IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
|
24 |
|
@@ -834,10 +836,10 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
|
|
834 |
# Filter items with non-empty boxes
|
835 |
non_empty_boxes = [item for item in items if item.get('boxes')]
|
836 |
|
837 |
-
# Remove 'text' elements from boxes
|
838 |
-
for item in non_empty_boxes:
|
839 |
-
|
840 |
-
|
841 |
|
842 |
if non_empty_boxes:
|
843 |
# Keep the first entry with non-empty boxes
|
@@ -855,13 +857,19 @@ def divide_coordinates_by_page_sizes(review_file_df:pd.DataFrame, page_sizes_df:
|
|
855 |
review_file_df_out = review_file_df
|
856 |
|
857 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
|
|
|
|
|
|
|
|
858 |
review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]
|
|
|
|
|
859 |
|
860 |
-
|
861 |
|
862 |
-
|
863 |
|
864 |
-
review_file_df_div =
|
865 |
|
866 |
if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
|
867 |
|
@@ -902,6 +910,11 @@ def multiply_coordinates_by_page_sizes(review_file_df: pd.DataFrame, page_sizes_
|
|
902 |
|
903 |
|
904 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
|
|
|
|
|
|
|
|
|
|
905 |
# Separate absolute vs relative coordinates
|
906 |
review_file_df_orig = review_file_df.loc[
|
907 |
(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
|
@@ -1014,6 +1027,12 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
|
|
1014 |
if not 'text' in df2.columns: df2['text'] = ''
|
1015 |
if not 'text' in df1.columns: df1['text'] = ''
|
1016 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1017 |
# Create a unique key based on coordinates and label for exact merge
|
1018 |
merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
|
1019 |
df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
|
@@ -1031,6 +1050,8 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
|
|
1031 |
|
1032 |
# Handle missing matches using a proximity-based approach
|
1033 |
# Convert coordinates to numpy arrays for KDTree lookup
|
|
|
|
|
1034 |
query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)
|
1035 |
|
1036 |
# Check for NaN or infinite values in query_coords and filter them out
|
@@ -1064,9 +1085,6 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
|
|
1064 |
|
1065 |
return merged_df
|
1066 |
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
def _extract_page_number(image_path: Any) -> int:
|
1071 |
"""Helper function to safely extract page number."""
|
1072 |
if not isinstance(image_path, str):
|
@@ -1085,7 +1103,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1085 |
'''
|
1086 |
if not all_annotations:
|
1087 |
# Return an empty DataFrame with the expected schema if input is empty
|
1088 |
-
return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text"])
|
1089 |
|
1090 |
# 1. Create initial DataFrame from the list of annotations
|
1091 |
# Use list comprehensions with .get() for robustness
|
@@ -1102,7 +1120,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1102 |
# Explode removes rows where the list is empty. We want to keep them
|
1103 |
# as rows with NA values. Replace empty lists with a list containing
|
1104 |
# a single placeholder dictionary.
|
1105 |
-
placeholder_box = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA}
|
1106 |
df['boxes'] = df['boxes'].apply(lambda x: x if x else [placeholder_box])
|
1107 |
|
1108 |
# 4. Explode the 'boxes' column. Each item in the list becomes a new row.
|
@@ -1124,7 +1142,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1124 |
# prevents this from being necessary.
|
1125 |
|
1126 |
# 7. Ensure essential columns exist and set column order
|
1127 |
-
essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text"]
|
1128 |
for col in essential_box_cols:
|
1129 |
if col not in final_df.columns:
|
1130 |
final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
|
@@ -1140,71 +1158,6 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1140 |
|
1141 |
return final_df
|
1142 |
|
1143 |
-
|
1144 |
-
# def convert_annotation_data_to_dataframe(all_annotations:List[dict]):
|
1145 |
-
# '''
|
1146 |
-
# Convert an annotation list of dictionaries to a dataframe with all boxes on a separate row
|
1147 |
-
# '''
|
1148 |
-
# # Flatten the data
|
1149 |
-
# flattened_annotation_data = []
|
1150 |
-
|
1151 |
-
# for annotation in all_annotations:
|
1152 |
-
# image_path = annotation["image"]
|
1153 |
-
|
1154 |
-
# if image_path:
|
1155 |
-
# match = re.search(r'_(\d+)\.png$', image_path)
|
1156 |
-
# if match:
|
1157 |
-
# number = match.group(1)
|
1158 |
-
# reported_number = int(number) + 1
|
1159 |
-
# else:
|
1160 |
-
# reported_number = 1
|
1161 |
-
# else:
|
1162 |
-
# reported_number = 1
|
1163 |
-
|
1164 |
-
# # Check if 'boxes' is in the annotation, if not, add an empty list
|
1165 |
-
# if 'boxes' not in annotation:
|
1166 |
-
# annotation['boxes'] = []
|
1167 |
-
|
1168 |
-
# # If boxes are empty, create a row with blank values for xmin, xmax, ymin, ymax
|
1169 |
-
# if not annotation["boxes"]:
|
1170 |
-
# data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA}
|
1171 |
-
# flattened_annotation_data.append(data_to_add)
|
1172 |
-
# else:
|
1173 |
-
# for box in annotation["boxes"]:
|
1174 |
-
# if 'xmin' not in box:
|
1175 |
-
# data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, 'xmax': pd.NA, 'ymin': pd.NA, 'ymax': pd.NA}
|
1176 |
-
# elif 'text' not in box:
|
1177 |
-
# data_to_add = {"image": image_path, "page": reported_number, **box}
|
1178 |
-
# else:
|
1179 |
-
# data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
|
1180 |
-
# flattened_annotation_data.append(data_to_add)
|
1181 |
-
|
1182 |
-
# # Convert to a DataFrame
|
1183 |
-
# review_file_df = pd.DataFrame(flattened_annotation_data)
|
1184 |
-
|
1185 |
-
# return review_file_df
|
1186 |
-
|
1187 |
-
# def create_annotation_dicts_from_annotation_df(all_image_annotations_df:pd.DataFrame, page_sizes:List[dict]):
|
1188 |
-
# '''
|
1189 |
-
# From an annotation object as a dataframe, convert back to a list of dictionaries that can be used in the Gradio Image Annotator component
|
1190 |
-
# '''
|
1191 |
-
# result = []
|
1192 |
-
|
1193 |
-
# # Ensure that every page has an entry in the resulting list of dicts
|
1194 |
-
# for image_path in page_sizes:
|
1195 |
-
# annotation = {}
|
1196 |
-
# annotation["image"] = image_path["image_path"]
|
1197 |
-
# annotation["boxes"] = []
|
1198 |
-
|
1199 |
-
# result.append(annotation)
|
1200 |
-
|
1201 |
-
# # Then add in all the filled in data
|
1202 |
-
# for image, group in all_image_annotations_df.groupby('image'):
|
1203 |
-
# boxes = group[['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']].to_dict(orient='records')
|
1204 |
-
# result.append({'image': image, 'boxes': boxes})
|
1205 |
-
|
1206 |
-
# return result
|
1207 |
-
|
1208 |
def create_annotation_dicts_from_annotation_df(
|
1209 |
all_image_annotations_df: pd.DataFrame,
|
1210 |
page_sizes: List[Dict[str, Any]]
|
@@ -1228,9 +1181,12 @@ def create_annotation_dicts_from_annotation_df(
|
|
1228 |
|
1229 |
# 2. Define columns to extract for boxes and check availability
|
1230 |
# Make sure these columns actually exist in the DataFrame
|
1231 |
-
box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
|
1232 |
available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
|
1233 |
|
|
|
|
|
|
|
1234 |
if not available_cols:
|
1235 |
print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
|
1236 |
return list(image_dict.values()) # Return based on page_sizes only
|
@@ -1248,7 +1204,6 @@ def create_annotation_dicts_from_annotation_df(
|
|
1248 |
print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
|
1249 |
return list(image_dict.values())
|
1250 |
|
1251 |
-
|
1252 |
# Process groups
|
1253 |
try:
|
1254 |
for image_path, group in valid_box_df.groupby('image', observed=True, sort=False):
|
@@ -1271,122 +1226,353 @@ def create_annotation_dicts_from_annotation_df(
|
|
1271 |
|
1272 |
return result
|
1273 |
|
1274 |
-
|
1275 |
-
|
1276 |
-
|
1277 |
-
|
1278 |
-
# all_image_annotations_df: pd.DataFrame,
|
1279 |
-
# page_sizes: List[Dict[str, Any]]
|
1280 |
-
# ) -> List[Dict[str, Any]]:
|
1281 |
-
# '''
|
1282 |
-
# Convert annotation DataFrame back to list of dicts using Pandas merge.
|
1283 |
-
# Ensures all images from page_sizes are present without duplicates.
|
1284 |
-
# '''
|
1285 |
-
# # 1. Create a DataFrame containing all required image paths from page_sizes
|
1286 |
-
# if not page_sizes:
|
1287 |
-
# return []
|
1288 |
-
# all_image_paths = [item.get("image_path") for item in page_sizes if item.get("image_path")]
|
1289 |
-
# if not all_image_paths:
|
1290 |
-
# return []
|
1291 |
-
# # Use unique paths
|
1292 |
-
# pages_df = pd.DataFrame({'image': list(set(all_image_paths))})
|
1293 |
-
|
1294 |
-
# # Check if the DataFrame is empty or lacks necessary columns
|
1295 |
-
# if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
|
1296 |
-
# print("Warning: Annotation DataFrame is empty or missing 'image' column.")
|
1297 |
-
# # Add empty boxes column and return
|
1298 |
-
# pages_df['boxes'] = [[] for _ in range(len(pages_df))]
|
1299 |
-
# return pages_df.to_dict(orient='records')
|
1300 |
-
|
1301 |
-
# # 2. Define columns to extract and check availability
|
1302 |
-
# box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
|
1303 |
-
# available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
|
1304 |
-
|
1305 |
-
# if not available_cols:
|
1306 |
-
# print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
|
1307 |
-
# pages_df['boxes'] = [[] for _ in range(len(pages_df))]
|
1308 |
-
# return pages_df.to_dict(orient='records')
|
1309 |
-
|
1310 |
-
# # 3. Prepare the annotation data: drop invalid rows and aggregate boxes
|
1311 |
-
# coord_cols = ['xmin', 'ymin', 'xmax', 'ymax']
|
1312 |
-
# valid_box_df = all_image_annotations_df.dropna(
|
1313 |
-
# subset=[col for col in coord_cols if col in available_cols]
|
1314 |
-
# ).copy() # Use .copy()
|
1315 |
-
|
1316 |
-
# if valid_box_df.empty:
|
1317 |
-
# print("Warning: No valid annotation rows found after dropping NA coordinates.")
|
1318 |
-
# pages_df['boxes'] = [[] for _ in range(len(pages_df))]
|
1319 |
-
# return pages_df.to_dict(orient='records')
|
1320 |
-
|
1321 |
-
|
1322 |
-
# # Aggregate boxes into lists of dictionaries per image
|
1323 |
-
# def aggregate_boxes(group):
|
1324 |
-
# return group[available_cols].to_dict(orient='records')
|
1325 |
-
|
1326 |
-
# # Group by image and apply the aggregation
|
1327 |
-
# grouped_boxes = valid_box_df.groupby('image', observed=True, sort=False).apply(aggregate_boxes).reset_index(name='boxes')
|
1328 |
-
|
1329 |
-
# # 4. Perform a left merge: keep all images from pages_df, add boxes where they exist
|
1330 |
-
# merged_df = pd.merge(pages_df, grouped_boxes, on='image', how='left')
|
1331 |
-
|
1332 |
-
# # 5. Fill NaN in 'boxes' column (for images with no annotations) with empty lists
|
1333 |
-
# # Ensure the column exists before trying to fillna
|
1334 |
-
# if 'boxes' in merged_df.columns:
|
1335 |
-
# # Use apply with a lambda for robust filling of NAs or potential None values
|
1336 |
-
# merged_df['boxes'] = merged_df['boxes'].apply(lambda x: [] if pd.isna(x) else x)
|
1337 |
-
# else:
|
1338 |
-
# # Should not happen with left merge, but handle defensively
|
1339 |
-
# merged_df['boxes'] = [[] for _ in range(len(merged_df))]
|
1340 |
-
|
1341 |
-
|
1342 |
-
# # 6. Convert the final DataFrame to the list of dictionaries format
|
1343 |
-
# result = merged_df.to_dict(orient='records')
|
1344 |
-
|
1345 |
-
# return result
|
1346 |
-
|
1347 |
-
def convert_annotation_json_to_review_df(all_annotations:List[dict],
|
1348 |
-
redaction_decision_output:pd.DataFrame=pd.DataFrame(),
|
1349 |
-
page_sizes:pd.DataFrame=pd.DataFrame(),
|
1350 |
-
do_proximity_match:bool=True) -> pd.DataFrame:
|
1351 |
'''
|
1352 |
-
Convert the annotation json data to a dataframe format.
|
|
|
|
|
1353 |
'''
|
1354 |
-
|
1355 |
-
review_file_df = convert_annotation_data_to_dataframe(all_annotations)
|
1356 |
-
|
1357 |
-
if page_sizes:
|
1358 |
-
page_sizes_df = pd.DataFrame(page_sizes)
|
1359 |
-
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
1360 |
-
|
1361 |
-
review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
1362 |
-
|
1363 |
-
redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)
|
1364 |
|
1365 |
-
#
|
1366 |
-
|
|
|
1367 |
|
1368 |
-
|
1369 |
-
review_file_df = do_proximity_match_all_pages_for_text(df1 = review_file_df.copy(), df2 = redaction_decision_output.copy())
|
1370 |
-
|
1371 |
-
# Ensure required columns exist, filling with blank if they don't
|
1372 |
-
check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
|
1373 |
|
1374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1375 |
if col not in review_file_df.columns:
|
|
|
|
|
1376 |
review_file_df[col] = ''
|
1377 |
|
1378 |
-
|
1379 |
-
|
1380 |
-
else:
|
1381 |
-
review_file_df = pd.DataFrame(columns=check_columns)
|
1382 |
|
|
|
1383 |
# If colours are saved as list, convert to tuple
|
1384 |
-
|
|
|
1385 |
|
1386 |
-
|
|
|
|
|
|
|
|
|
|
|
1387 |
|
1388 |
return review_file_df
|
1389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1390 |
def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
|
1391 |
image_paths:List[Image.Image],
|
1392 |
page_sizes:List[dict]=[]) -> List[dict]:
|
@@ -1404,9 +1590,15 @@ def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
|
|
1404 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
1405 |
|
1406 |
review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
1407 |
|
1408 |
# Keep only necessary columns
|
1409 |
-
review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label"])
|
1410 |
|
1411 |
# If colours are saved as list, convert to tuple
|
1412 |
review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
|
|
19 |
from pdf2image import convert_from_path
|
20 |
from PIL import Image
|
21 |
from scipy.spatial import cKDTree
|
22 |
+
import random
|
23 |
+
import string
|
24 |
|
25 |
IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
|
26 |
|
|
|
836 |
# Filter items with non-empty boxes
|
837 |
non_empty_boxes = [item for item in items if item.get('boxes')]
|
838 |
|
839 |
+
# Remove 'text' elements from boxes (deprecated)
|
840 |
+
#for item in non_empty_boxes:
|
841 |
+
# if 'boxes' in item:
|
842 |
+
# item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
|
843 |
|
844 |
if non_empty_boxes:
|
845 |
# Keep the first entry with non-empty boxes
|
|
|
857 |
review_file_df_out = review_file_df
|
858 |
|
859 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
860 |
+
coord_cols = [xmin, xmax, ymin, ymax]
|
861 |
+
for col in coord_cols:
|
862 |
+
review_file_df.loc[:, col] = pd.to_numeric(review_file_df[col], errors="coerce")
|
863 |
+
|
864 |
review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]
|
865 |
+
|
866 |
+
#print("review_file_df_orig:", review_file_df_orig)
|
867 |
|
868 |
+
review_file_df_div = review_file_df.loc[(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) & (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1),:]
|
869 |
|
870 |
+
#print("review_file_df_div:", review_file_df_div)
|
871 |
|
872 |
+
review_file_df_div.loc[:, "page"] = pd.to_numeric(review_file_df_div["page"], errors="coerce")
|
873 |
|
874 |
if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
|
875 |
|
|
|
910 |
|
911 |
|
912 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
913 |
+
|
914 |
+
coord_cols = [xmin, xmax, ymin, ymax]
|
915 |
+
for col in coord_cols:
|
916 |
+
review_file_df.loc[:, col] = pd.to_numeric(review_file_df[col], errors="coerce")
|
917 |
+
|
918 |
# Separate absolute vs relative coordinates
|
919 |
review_file_df_orig = review_file_df.loc[
|
920 |
(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
|
|
|
1027 |
if not 'text' in df2.columns: df2['text'] = ''
|
1028 |
if not 'text' in df1.columns: df1['text'] = ''
|
1029 |
|
1030 |
+
for col in ['xmin', 'ymin', 'xmax', 'ymax']:
|
1031 |
+
df1[col] = pd.to_numeric(df1[col], errors='coerce')
|
1032 |
+
|
1033 |
+
for col in ['xmin', 'ymin', 'xmax', 'ymax']:
|
1034 |
+
df2[col] = pd.to_numeric(df2[col], errors='coerce')
|
1035 |
+
|
1036 |
# Create a unique key based on coordinates and label for exact merge
|
1037 |
merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
|
1038 |
df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
|
|
|
1050 |
|
1051 |
# Handle missing matches using a proximity-based approach
|
1052 |
# Convert coordinates to numpy arrays for KDTree lookup
|
1053 |
+
|
1054 |
+
|
1055 |
query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)
|
1056 |
|
1057 |
# Check for NaN or infinite values in query_coords and filter them out
|
|
|
1085 |
|
1086 |
return merged_df
|
1087 |
|
|
|
|
|
|
|
1088 |
def _extract_page_number(image_path: Any) -> int:
|
1089 |
"""Helper function to safely extract page number."""
|
1090 |
if not isinstance(image_path, str):
|
|
|
1103 |
'''
|
1104 |
if not all_annotations:
|
1105 |
# Return an empty DataFrame with the expected schema if input is empty
|
1106 |
+
return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
1107 |
|
1108 |
# 1. Create initial DataFrame from the list of annotations
|
1109 |
# Use list comprehensions with .get() for robustness
|
|
|
1120 |
# Explode removes rows where the list is empty. We want to keep them
|
1121 |
# as rows with NA values. Replace empty lists with a list containing
|
1122 |
# a single placeholder dictionary.
|
1123 |
+
placeholder_box = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA, "id": pd.NA}
|
1124 |
df['boxes'] = df['boxes'].apply(lambda x: x if x else [placeholder_box])
|
1125 |
|
1126 |
# 4. Explode the 'boxes' column. Each item in the list becomes a new row.
|
|
|
1142 |
# prevents this from being necessary.
|
1143 |
|
1144 |
# 7. Ensure essential columns exist and set column order
|
1145 |
+
essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id"]
|
1146 |
for col in essential_box_cols:
|
1147 |
if col not in final_df.columns:
|
1148 |
final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
|
|
|
1158 |
|
1159 |
return final_df
|
1160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1161 |
def create_annotation_dicts_from_annotation_df(
|
1162 |
all_image_annotations_df: pd.DataFrame,
|
1163 |
page_sizes: List[Dict[str, Any]]
|
|
|
1181 |
|
1182 |
# 2. Define columns to extract for boxes and check availability
|
1183 |
# Make sure these columns actually exist in the DataFrame
|
1184 |
+
box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label', 'text', 'id']
|
1185 |
available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
|
1186 |
|
1187 |
+
if 'text' in all_image_annotations_df.columns:
|
1188 |
+
all_image_annotations_df.loc[all_image_annotations_df['text'].isnull(), 'text'] = ''
|
1189 |
+
|
1190 |
if not available_cols:
|
1191 |
print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
|
1192 |
return list(image_dict.values()) # Return based on page_sizes only
|
|
|
1204 |
print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
|
1205 |
return list(image_dict.values())
|
1206 |
|
|
|
1207 |
# Process groups
|
1208 |
try:
|
1209 |
for image_path, group in valid_box_df.groupby('image', observed=True, sort=False):
|
|
|
1226 |
|
1227 |
return result
|
1228 |
|
1229 |
+
def convert_annotation_json_to_review_df(all_annotations: List[dict],
|
1230 |
+
redaction_decision_output: pd.DataFrame = pd.DataFrame(),
|
1231 |
+
page_sizes: List[dict] = [],
|
1232 |
+
do_proximity_match: bool = True) -> pd.DataFrame:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1233 |
'''
|
1234 |
+
Convert the annotation json data to a dataframe format.
|
1235 |
+
Add on any text from the initial review_file dataframe by joining based on 'id' if available
|
1236 |
+
in both sources, otherwise falling back to joining on pages/co-ordinates (if option selected).
|
1237 |
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1238 |
|
1239 |
+
# 1. Convert annotations to DataFrame
|
1240 |
+
# Ensure convert_annotation_data_to_dataframe populates the 'id' column
|
1241 |
+
# if 'id' exists in the dictionaries within all_annotations.
|
1242 |
|
1243 |
+
review_file_df = convert_annotation_data_to_dataframe(all_annotations)
|
|
|
|
|
|
|
|
|
1244 |
|
1245 |
+
# Only keep rows in review_df where there are coordinates
|
1246 |
+
review_file_df.dropna(subset='xmin', axis=0, inplace=True)
|
1247 |
+
|
1248 |
+
# Exit early if the initial conversion results in an empty DataFrame
|
1249 |
+
if review_file_df.empty:
|
1250 |
+
# Define standard columns for an empty return DataFrame
|
1251 |
+
check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text", "id"]
|
1252 |
+
# Ensure 'id' is included if it might have been expected
|
1253 |
+
return pd.DataFrame(columns=[col for col in check_columns if col != 'id' or 'id' in review_file_df.columns])
|
1254 |
+
|
1255 |
+
# 2. Handle page sizes if provided
|
1256 |
+
if not page_sizes:
|
1257 |
+
page_sizes_df = pd.DataFrame(page_sizes) # Ensure it's a DataFrame
|
1258 |
+
# Safely convert page column to numeric
|
1259 |
+
page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
|
1260 |
+
page_sizes_df.dropna(subset=["page"], inplace=True) # Drop rows where conversion failed
|
1261 |
+
page_sizes_df["page"] = page_sizes_df["page"].astype(int) # Convert to int after handling errors/NaNs
|
1262 |
+
|
1263 |
+
|
1264 |
+
# Apply coordinate division if page_sizes_df is not empty after processing
|
1265 |
+
if not page_sizes_df.empty:
|
1266 |
+
# Ensure 'page' column in review_file_df is numeric for merging
|
1267 |
+
if 'page' in review_file_df.columns:
|
1268 |
+
review_file_df['page'] = pd.to_numeric(review_file_df['page'], errors='coerce')
|
1269 |
+
# Drop rows with invalid pages before division
|
1270 |
+
review_file_df.dropna(subset=['page'], inplace=True)
|
1271 |
+
review_file_df['page'] = review_file_df['page'].astype(int)
|
1272 |
+
review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
1273 |
+
|
1274 |
+
print("review_file_df after coord divide:", review_file_df)
|
1275 |
+
|
1276 |
+
# Also apply to redaction_decision_output if it's not empty and has page numbers
|
1277 |
+
if not redaction_decision_output.empty and 'page' in redaction_decision_output.columns:
|
1278 |
+
redaction_decision_output['page'] = pd.to_numeric(redaction_decision_output['page'], errors='coerce')
|
1279 |
+
# Drop rows with invalid pages before division
|
1280 |
+
redaction_decision_output.dropna(subset=['page'], inplace=True)
|
1281 |
+
redaction_decision_output['page'] = redaction_decision_output['page'].astype(int)
|
1282 |
+
redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)
|
1283 |
+
|
1284 |
+
print("redaction_decision_output after coord divide:", redaction_decision_output)
|
1285 |
+
else:
|
1286 |
+
print("Warning: Page sizes DataFrame became empty after processing, skipping coordinate division.")
|
1287 |
+
|
1288 |
+
|
1289 |
+
# 3. Join additional data from redaction_decision_output if provided
|
1290 |
+
if not redaction_decision_output.empty:
|
1291 |
+
# --- NEW LOGIC: Prioritize joining by 'id' ---
|
1292 |
+
id_col_exists_in_review = 'id' in review_file_df.columns
|
1293 |
+
id_col_exists_in_redaction = 'id' in redaction_decision_output.columns
|
1294 |
+
joined_by_id = False # Flag to track if ID join was successful
|
1295 |
+
|
1296 |
+
if id_col_exists_in_review and id_col_exists_in_redaction:
|
1297 |
+
#print("Attempting to join data based on 'id' column.")
|
1298 |
+
try:
|
1299 |
+
# Ensure 'id' columns are of compatible types (e.g., string) to avoid merge errors
|
1300 |
+
review_file_df['id'] = review_file_df['id'].astype(str)
|
1301 |
+
# Make a copy to avoid SettingWithCopyWarning if redaction_decision_output is used elsewhere
|
1302 |
+
redaction_copy = redaction_decision_output.copy()
|
1303 |
+
redaction_copy['id'] = redaction_copy['id'].astype(str)
|
1304 |
+
|
1305 |
+
# Select columns to merge from redaction output.
|
1306 |
+
# Primarily interested in 'text', but keep 'id' for the merge key.
|
1307 |
+
# Add other columns from redaction_copy if needed.
|
1308 |
+
cols_to_merge = ['id']
|
1309 |
+
if 'text' in redaction_copy.columns:
|
1310 |
+
cols_to_merge.append('text')
|
1311 |
+
else:
|
1312 |
+
print("Warning: 'text' column not found in redaction_decision_output. Cannot merge text using 'id'.")
|
1313 |
+
|
1314 |
+
# Perform a left merge to keep all annotations and add matching text
|
1315 |
+
# Suffixes prevent collision if 'text' already exists and we want to compare/choose
|
1316 |
+
original_cols = review_file_df.columns.tolist()
|
1317 |
+
merged_df = pd.merge(
|
1318 |
+
review_file_df,
|
1319 |
+
redaction_copy[cols_to_merge],
|
1320 |
+
on='id',
|
1321 |
+
how='left',
|
1322 |
+
suffixes=('', '_redaction') # Suffix applied to columns from right df if names clash
|
1323 |
+
)
|
1324 |
+
|
1325 |
+
# Update the original 'text' column. Prioritize text from redaction output.
|
1326 |
+
# If redaction output had 'text', a 'text_redaction' column now exists.
|
1327 |
+
if 'text_redaction' in merged_df.columns:
|
1328 |
+
if 'text' not in merged_df.columns: # If review_file_df didn't have text initially
|
1329 |
+
merged_df['text'] = merged_df['text_redaction']
|
1330 |
+
else:
|
1331 |
+
# Use text from redaction where available, otherwise keep original text
|
1332 |
+
merged_df['text'] = merged_df['text_redaction'].combine_first(merged_df['text'])
|
1333 |
+
|
1334 |
+
# Remove the temporary column
|
1335 |
+
merged_df = merged_df.drop(columns=['text_redaction'])
|
1336 |
+
|
1337 |
+
# Ensure final columns match original expectation + potentially new 'text'
|
1338 |
+
final_cols = original_cols
|
1339 |
+
if 'text' not in final_cols and 'text' in merged_df.columns:
|
1340 |
+
final_cols.append('text') # Make sure text column is kept if newly added
|
1341 |
+
# Reorder/select columns if necessary, ensuring 'id' is kept
|
1342 |
+
review_file_df = merged_df[[col for col in final_cols if col in merged_df.columns] + (['id'] if 'id' not in final_cols else [])]
|
1343 |
+
|
1344 |
+
|
1345 |
+
#print("Successfully joined data using 'id'.")
|
1346 |
+
joined_by_id = True
|
1347 |
+
|
1348 |
+
except Exception as e:
|
1349 |
+
print(f"Error during 'id'-based merge: {e}. Falling back to proximity match if enabled.")
|
1350 |
+
# Fall through to proximity match below if an error occurred
|
1351 |
+
|
1352 |
+
# --- Fallback to proximity match ---
|
1353 |
+
if not joined_by_id and do_proximity_match:
|
1354 |
+
if not id_col_exists_in_review or not id_col_exists_in_redaction:
|
1355 |
+
print("Could not join by 'id' (column missing in one or both sources).")
|
1356 |
+
print("Performing proximity match to add text data.")
|
1357 |
+
# Match text to review file using proximity
|
1358 |
+
|
1359 |
+
review_file_df = do_proximity_match_all_pages_for_text(df1=review_file_df.copy(), df2=redaction_decision_output.copy())
|
1360 |
+
elif not joined_by_id and not do_proximity_match:
|
1361 |
+
print("Skipping joining text data (ID join not possible, proximity match disabled).")
|
1362 |
+
# --- End of join logic ---
|
1363 |
+
|
1364 |
+
# 4. Ensure required columns exist, filling with blank if they don't
|
1365 |
+
# Define base required columns, 'id' might or might not be present initially
|
1366 |
+
required_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
|
1367 |
+
# Add 'id' to required list if it exists in the dataframe at this point
|
1368 |
+
if 'id' in review_file_df.columns:
|
1369 |
+
required_columns.append('id')
|
1370 |
+
|
1371 |
+
for col in required_columns:
|
1372 |
if col not in review_file_df.columns:
|
1373 |
+
# Decide default value based on column type (e.g., '' for text, np.nan for numeric?)
|
1374 |
+
# Using '' for simplicity here.
|
1375 |
review_file_df[col] = ''
|
1376 |
|
1377 |
+
# Select and order the final set of columns
|
1378 |
+
review_file_df = review_file_df[required_columns]
|
|
|
|
|
1379 |
|
1380 |
+
# 5. Final processing and sorting
|
1381 |
# If colours are saved as list, convert to tuple
|
1382 |
+
if 'color' in review_file_df.columns:
|
1383 |
+
review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
1384 |
|
1385 |
+
# Sort the results
|
1386 |
+
sort_columns = ['page', 'ymin', 'xmin', 'label']
|
1387 |
+
# Ensure sort columns exist before sorting
|
1388 |
+
valid_sort_columns = [col for col in sort_columns if col in review_file_df.columns]
|
1389 |
+
if valid_sort_columns:
|
1390 |
+
review_file_df = review_file_df.sort_values(valid_sort_columns)
|
1391 |
|
1392 |
return review_file_df
|
1393 |
|
1394 |
+
def fill_missing_box_ids(data_input: dict) -> dict:
|
1395 |
+
"""
|
1396 |
+
Generates unique alphanumeric IDs for bounding boxes in an input dictionary
|
1397 |
+
where the 'id' is missing, blank, or not a 12-character string.
|
1398 |
+
|
1399 |
+
Args:
|
1400 |
+
data_input (dict): The input dictionary containing 'image' and 'boxes' keys.
|
1401 |
+
'boxes' should be a list of dictionaries, each potentially
|
1402 |
+
with an 'id' key.
|
1403 |
+
|
1404 |
+
Returns:
|
1405 |
+
dict: The input dictionary with missing/invalid box IDs filled.
|
1406 |
+
Note: The function modifies the input dictionary in place.
|
1407 |
+
"""
|
1408 |
+
|
1409 |
+
# --- Input Validation ---
|
1410 |
+
if not isinstance(data_input, dict):
|
1411 |
+
raise TypeError("Input 'data_input' must be a dictionary.")
|
1412 |
+
#if 'boxes' not in data_input or not isinstance(data_input.get('boxes'), list):
|
1413 |
+
# raise ValueError("Input dictionary must contain a 'boxes' key with a list value.")
|
1414 |
+
|
1415 |
+
boxes = data_input#['boxes']
|
1416 |
+
id_length = 12
|
1417 |
+
character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
|
1418 |
+
|
1419 |
+
# --- Get Existing IDs to Ensure Uniqueness ---
|
1420 |
+
# Collect all valid existing IDs first
|
1421 |
+
existing_ids = set()
|
1422 |
+
#for box in boxes:
|
1423 |
+
# Check if 'id' exists, is a string, and is the correct length
|
1424 |
+
box_id = boxes.get('id')
|
1425 |
+
if isinstance(box_id, str) and len(box_id) == id_length:
|
1426 |
+
existing_ids.add(box_id)
|
1427 |
+
|
1428 |
+
# --- Identify and Fill Rows Needing IDs ---
|
1429 |
+
generated_ids_set = set() # Keep track of IDs generated *in this run*
|
1430 |
+
num_filled = 0
|
1431 |
+
|
1432 |
+
#for box in boxes:
|
1433 |
+
box_id = boxes.get('id')
|
1434 |
+
|
1435 |
+
# Check if ID needs to be generated
|
1436 |
+
# Needs ID if: key is missing, value is None, value is not a string,
|
1437 |
+
# value is an empty string after stripping whitespace, or value is a string
|
1438 |
+
# but not of the correct length.
|
1439 |
+
needs_new_id = (
|
1440 |
+
box_id is None or
|
1441 |
+
not isinstance(box_id, str) or
|
1442 |
+
box_id.strip() == "" or
|
1443 |
+
len(box_id) != id_length
|
1444 |
+
)
|
1445 |
+
|
1446 |
+
if needs_new_id:
|
1447 |
+
# Generate a unique ID
|
1448 |
+
attempts = 0
|
1449 |
+
while True:
|
1450 |
+
candidate_id = ''.join(random.choices(character_set, k=id_length))
|
1451 |
+
# Check against *all* existing valid IDs and *newly* generated ones in this run
|
1452 |
+
if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
|
1453 |
+
generated_ids_set.add(candidate_id)
|
1454 |
+
boxes['id'] = candidate_id # Assign the new ID directly to the box dict
|
1455 |
+
num_filled += 1
|
1456 |
+
break # Found a unique ID
|
1457 |
+
attempts += 1
|
1458 |
+
# Safety break for unlikely infinite loop (though highly improbable with 12 chars)
|
1459 |
+
if attempts > len(boxes) * 100 + 1000:
|
1460 |
+
raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check ID length or existing IDs.")
|
1461 |
+
|
1462 |
+
if num_filled > 0:
|
1463 |
+
pass
|
1464 |
+
#print(f"Successfully filled {num_filled} missing or invalid box IDs.")
|
1465 |
+
else:
|
1466 |
+
pass
|
1467 |
+
#print("No missing or invalid box IDs found.")
|
1468 |
+
|
1469 |
+
|
1470 |
+
# The input dictionary 'data_input' has been modified in place
|
1471 |
+
return data_input
|
1472 |
+
|
1473 |
+
def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
    """
    Generates unique alphanumeric IDs for rows in a DataFrame column
    where the value is missing (NaN, None), not a string, or not exactly
    `length` characters long.

    Args:
        df (pd.DataFrame): The input Pandas DataFrame.
        column_name (str): The name of the column to check and fill (defaults to 'id').
                           This column will be added if it doesn't exist.
        length (int): The desired length of the generated IDs (defaults to 12).
                      Cannot exceed the limits that guarantee uniqueness based
                      on the number of IDs needed and character set size.

    Returns:
        pd.DataFrame: The DataFrame with missing/invalid IDs filled in the specified
                      column. Note: The function modifies the DataFrame in place.

    Raises:
        TypeError: If 'df' is not a Pandas DataFrame.
        ValueError: If 'column_name' is empty, 'length' is not a positive integer,
                    or more unique IDs are needed than the ID space can provide.
        RuntimeError: If unique ID generation keeps colliding (practically
                      unreachable with the default 12-character alphanumeric IDs).
    """

    # --- Input Validation ---
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input 'df' must be a Pandas DataFrame.")
    if not isinstance(column_name, str) or not column_name:
        raise ValueError("'column_name' must be a non-empty string.")
    if not isinstance(length, int) or length <= 0:
        raise ValueError("'length' must be a positive integer.")

    # --- Ensure Column Exists ---
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
        df[column_name] = np.nan  # Initialize with NaN

    # --- Identify Rows Needing IDs ---
    # BUGFIX: the previous implementation cast the whole column to str *before*
    # calling isna(), which turned NaN/None into the literal string "nan" and
    # made the NaN check dead code (and mutated valid values even when nothing
    # needed filling). Compute the mask on the original values instead.
    def _needs_new_id(value) -> bool:
        # NaN, None, and any non-string (numbers, objects) all need a fresh ID,
        # as does any string that is not exactly `length` characters long.
        if not isinstance(value, str):
            return True
        return len(value) != length

    is_missing_or_empty = df[column_name].map(_needs_new_id)

    rows_to_fill_index = df.index[is_missing_or_empty]
    num_needed = len(rows_to_fill_index)

    if num_needed == 0:
        # Nothing to do; leave the column untouched.
        return df

    print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")

    # --- Get Existing IDs to Ensure Uniqueness ---
    # Rows not flagged by the mask are guaranteed to hold strings already.
    existing_ids = set(df.loc[~is_missing_or_empty, column_name].astype(str))

    # --- Generate Unique IDs ---
    character_set = string.ascii_letters + string.digits  # a-z, A-Z, 0-9

    max_possible_ids = len(character_set) ** length
    if num_needed > max_possible_ids:
        raise ValueError(f"Cannot generate {num_needed} unique IDs with length {length}. Maximum possible is {max_possible_ids}.")

    generated_ids_set = set()  # Keep track of IDs generated *in this run*
    new_ids_list = []          # Store the generated IDs in row order

    for i in range(num_needed):
        attempts = 0
        while True:
            candidate_id = ''.join(random.choices(character_set, k=length))
            # Check against *all* existing IDs and *newly* generated ones
            if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
                generated_ids_set.add(candidate_id)
                new_ids_list.append(candidate_id)
                break  # Found a unique ID
            attempts += 1
            # Safety break for an unlikely infinite loop
            if attempts > num_needed * 100 and attempts > 1000:
                raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check length and character set or existing IDs.")

        # Optional progress update for large numbers
        if (i + 1) % 1000 == 0:
            print(f"Generated {i+1}/{num_needed} IDs...")

    # --- Assign New IDs ---
    # Use the previously identified index to assign the new IDs correctly,
    # then normalise the column to string dtype (downstream code expects str).
    df.loc[rows_to_fill_index, column_name] = new_ids_list
    df[column_name] = df[column_name].astype(str)

    # The DataFrame 'df' has been modified in place
    return df
|
1575 |
+
|
1576 |
def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
|
1577 |
image_paths:List[Image.Image],
|
1578 |
page_sizes:List[dict]=[]) -> List[dict]:
|
|
|
1590 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
1591 |
|
1592 |
review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
1593 |
+
|
1594 |
+
review_file_df = fill_missing_ids(review_file_df)
|
1595 |
+
|
1596 |
+
if 'id' not in review_file_df.columns:
|
1597 |
+
review_file_df['id'] = ''
|
1598 |
+
review_file_df['id'] = review_file_df['id'].astype(str)
|
1599 |
|
1600 |
# Keep only necessary columns
|
1601 |
+
review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "id", "text"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label", "id"])
|
1602 |
|
1603 |
# If colours are saved as list, convert to tuple
|
1604 |
review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
tools/file_redaction.py
CHANGED
@@ -21,7 +21,7 @@ from collections import defaultdict # For efficient grouping
|
|
21 |
|
22 |
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
|
23 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
24 |
-
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
|
25 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
26 |
from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
|
27 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
@@ -166,10 +166,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
166 |
|
167 |
# Ensure all_pages_decision_process_table is in correct format for downstream processes
|
168 |
if isinstance(all_pages_decision_process_table,list):
|
169 |
-
if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"])
|
170 |
elif isinstance(all_pages_decision_process_table, pd.DataFrame):
|
171 |
if all_pages_decision_process_table.empty:
|
172 |
-
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"])
|
173 |
|
174 |
# If this is the first time around, set variables to 0/blank
|
175 |
if first_loop_state==True:
|
@@ -211,6 +211,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
211 |
if latest_file_completed >= number_of_files:
|
212 |
|
213 |
print("Completed last file")
|
|
|
214 |
current_loop_page = 0
|
215 |
|
216 |
if isinstance(out_message, list) and out_message:
|
@@ -383,7 +384,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
383 |
|
384 |
progress(0.5, desc="Extracting text and redacting document")
|
385 |
|
386 |
-
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax",
|
387 |
all_line_level_ocr_results_df = pd.DataFrame()
|
388 |
|
389 |
# Run through file loop, redact each file at a time
|
@@ -502,6 +503,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
502 |
if latest_file_completed != len(file_paths_list):
|
503 |
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
504 |
|
|
|
|
|
505 |
# Save redacted file
|
506 |
if pii_identification_method != no_redaction_option:
|
507 |
if is_pdf(file_path) == False:
|
@@ -512,7 +515,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
512 |
#
|
513 |
else:
|
514 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
515 |
-
print("
|
516 |
pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
|
517 |
|
518 |
out_file_paths.append(out_redacted_pdf_file_path)
|
@@ -522,7 +525,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
522 |
else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
523 |
|
524 |
ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
|
525 |
-
|
526 |
all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
|
527 |
|
528 |
all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
|
@@ -539,6 +541,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
539 |
|
540 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
541 |
|
|
|
|
|
542 |
# Save the gradio_annotation_boxes to a review csv file
|
543 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
544 |
|
@@ -838,7 +842,10 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
|
|
838 |
if hasattr(annot, 'text') and annot.text:
|
839 |
img_annotation_box["text"] = str(annot.text)
|
840 |
else:
|
841 |
-
img_annotation_box["text"] = ""
|
|
|
|
|
|
|
842 |
|
843 |
return img_annotation_box, rect
|
844 |
|
@@ -953,6 +960,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
953 |
page_annotations = page_annotations["boxes"]
|
954 |
|
955 |
for annot in page_annotations:
|
|
|
|
|
|
|
|
|
956 |
# Check if an Image recogniser result, or a Gradio annotation object
|
957 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
958 |
|
@@ -960,6 +971,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
960 |
|
961 |
# Should already be in correct format if img_annotator_box is an input
|
962 |
if isinstance(annot, dict):
|
|
|
963 |
img_annotation_box = annot
|
964 |
|
965 |
box_coordinates = (img_annotation_box['xmin'], img_annotation_box['ymin'], img_annotation_box['xmax'], img_annotation_box['ymax'])
|
@@ -1004,6 +1016,8 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1004 |
|
1005 |
img_annotation_box, rect = convert_pikepdf_annotations_to_result_annotation_box(page, annot, image, convert_pikepdf_to_pymupdf_coords, page_sizes_df, image_dimensions=image_dimensions)
|
1006 |
|
|
|
|
|
1007 |
#print("image_dimensions:", image_dimensions)
|
1008 |
#print("annot:", annot)
|
1009 |
|
@@ -1155,7 +1169,7 @@ def redact_image_pdf(file_path:str,
|
|
1155 |
page_break_return:bool=False,
|
1156 |
annotations_all_pages:List=[],
|
1157 |
all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
|
1158 |
-
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"]),
|
1159 |
pymupdf_doc:Document = [],
|
1160 |
pii_identification_method:str="Local",
|
1161 |
comprehend_query_number:int=0,
|
@@ -1490,11 +1504,15 @@ def redact_image_pdf(file_path:str,
|
|
1490 |
'start': result.start,
|
1491 |
'end': result.end,
|
1492 |
'score': result.score,
|
1493 |
-
'page': reported_page_number
|
1494 |
} for result in page_merged_redaction_bboxes])
|
1495 |
|
1496 |
all_pages_decision_process_table_list.append(decision_process_table)
|
1497 |
|
|
|
|
|
|
|
|
|
1498 |
# Convert to DataFrame and add to ongoing logging table
|
1499 |
line_level_ocr_results_df = pd.DataFrame([{
|
1500 |
'page': reported_page_number,
|
@@ -1739,12 +1757,16 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
1739 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1740 |
|
1741 |
# Convert the new columns to integers (if needed)
|
1742 |
-
analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
1743 |
|
1744 |
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
1745 |
analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
|
1746 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1747 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
|
|
|
|
|
|
|
|
1748 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
1749 |
|
1750 |
return decision_process_table
|
@@ -1786,7 +1808,7 @@ def redact_text_pdf(
|
|
1786 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
1787 |
annotations_all_pages: List[dict] = [], # List of annotations across all pages
|
1788 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1789 |
-
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax",
|
1790 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1791 |
pii_identification_method: str = "Local",
|
1792 |
comprehend_query_number:int = 0,
|
@@ -1967,7 +1989,7 @@ def redact_text_pdf(
|
|
1967 |
pymupdf_page, page_image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_redaction_annotations_on_page, image_path, redact_whole_page=redact_whole_page, convert_pikepdf_to_pymupdf_coords=True, original_cropbox=original_cropboxes[page_no], page_sizes_df=page_sizes_df)
|
1968 |
|
1969 |
# Create decision process table
|
1970 |
-
page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
|
1971 |
|
1972 |
if not page_decision_process_table.empty:
|
1973 |
all_pages_decision_process_table_list.append(page_decision_process_table)
|
@@ -2035,7 +2057,7 @@ def redact_text_pdf(
|
|
2035 |
|
2036 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2037 |
|
2038 |
-
# Write
|
2039 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
2040 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
2041 |
|
|
|
21 |
|
22 |
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
|
23 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
24 |
+
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids
|
25 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
26 |
from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
|
27 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
|
|
166 |
|
167 |
# Ensure all_pages_decision_process_table is in correct format for downstream processes
|
168 |
if isinstance(all_pages_decision_process_table,list):
|
169 |
+
if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
170 |
elif isinstance(all_pages_decision_process_table, pd.DataFrame):
|
171 |
if all_pages_decision_process_table.empty:
|
172 |
+
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
173 |
|
174 |
# If this is the first time around, set variables to 0/blank
|
175 |
if first_loop_state==True:
|
|
|
211 |
if latest_file_completed >= number_of_files:
|
212 |
|
213 |
print("Completed last file")
|
214 |
+
progress(0.95, "Completed last file, performing final checks")
|
215 |
current_loop_page = 0
|
216 |
|
217 |
if isinstance(out_message, list) and out_message:
|
|
|
384 |
|
385 |
progress(0.5, desc="Extracting text and redacting document")
|
386 |
|
387 |
+
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
388 |
all_line_level_ocr_results_df = pd.DataFrame()
|
389 |
|
390 |
# Run through file loop, redact each file at a time
|
|
|
503 |
if latest_file_completed != len(file_paths_list):
|
504 |
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
505 |
|
506 |
+
progress(0.9, "Saving redacted PDF file")
|
507 |
+
|
508 |
# Save redacted file
|
509 |
if pii_identification_method != no_redaction_option:
|
510 |
if is_pdf(file_path) == False:
|
|
|
515 |
#
|
516 |
else:
|
517 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
518 |
+
print("Saving redacted PDF file:", out_redacted_pdf_file_path)
|
519 |
pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
|
520 |
|
521 |
out_file_paths.append(out_redacted_pdf_file_path)
|
|
|
525 |
else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
526 |
|
527 |
ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
|
|
|
528 |
all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
|
529 |
|
530 |
all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
|
|
|
541 |
|
542 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
543 |
|
544 |
+
|
545 |
+
|
546 |
# Save the gradio_annotation_boxes to a review csv file
|
547 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
548 |
|
|
|
842 |
if hasattr(annot, 'text') and annot.text:
|
843 |
img_annotation_box["text"] = str(annot.text)
|
844 |
else:
|
845 |
+
img_annotation_box["text"] = ""
|
846 |
+
|
847 |
+
# Assign an id
|
848 |
+
img_annotation_box = fill_missing_box_ids(img_annotation_box)
|
849 |
|
850 |
return img_annotation_box, rect
|
851 |
|
|
|
960 |
page_annotations = page_annotations["boxes"]
|
961 |
|
962 |
for annot in page_annotations:
|
963 |
+
|
964 |
+
|
965 |
+
|
966 |
+
|
967 |
# Check if an Image recogniser result, or a Gradio annotation object
|
968 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
969 |
|
|
|
971 |
|
972 |
# Should already be in correct format if img_annotator_box is an input
|
973 |
if isinstance(annot, dict):
|
974 |
+
annot = fill_missing_box_ids(annot)
|
975 |
img_annotation_box = annot
|
976 |
|
977 |
box_coordinates = (img_annotation_box['xmin'], img_annotation_box['ymin'], img_annotation_box['xmax'], img_annotation_box['ymax'])
|
|
|
1016 |
|
1017 |
img_annotation_box, rect = convert_pikepdf_annotations_to_result_annotation_box(page, annot, image, convert_pikepdf_to_pymupdf_coords, page_sizes_df, image_dimensions=image_dimensions)
|
1018 |
|
1019 |
+
img_annotation_box = fill_missing_box_ids(img_annotation_box)
|
1020 |
+
|
1021 |
#print("image_dimensions:", image_dimensions)
|
1022 |
#print("annot:", annot)
|
1023 |
|
|
|
1169 |
page_break_return:bool=False,
|
1170 |
annotations_all_pages:List=[],
|
1171 |
all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
|
1172 |
+
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
|
1173 |
pymupdf_doc:Document = [],
|
1174 |
pii_identification_method:str="Local",
|
1175 |
comprehend_query_number:int=0,
|
|
|
1504 |
'start': result.start,
|
1505 |
'end': result.end,
|
1506 |
'score': result.score,
|
1507 |
+
'page': reported_page_number
|
1508 |
} for result in page_merged_redaction_bboxes])
|
1509 |
|
1510 |
all_pages_decision_process_table_list.append(decision_process_table)
|
1511 |
|
1512 |
+
decision_process_table = fill_missing_ids(decision_process_table)
|
1513 |
+
#decision_process_table.to_csv("output/decision_process_table_with_ids.csv")
|
1514 |
+
|
1515 |
+
|
1516 |
# Convert to DataFrame and add to ongoing logging table
|
1517 |
line_level_ocr_results_df = pd.DataFrame([{
|
1518 |
'page': reported_page_number,
|
|
|
1757 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1758 |
|
1759 |
# Convert the new columns to integers (if needed)
|
1760 |
+
#analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
1761 |
|
1762 |
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
1763 |
analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
|
1764 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1765 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
1766 |
+
|
1767 |
+
#analysed_bounding_boxes_df_new = fill_missing_ids(analysed_bounding_boxes_df_new)
|
1768 |
+
analysed_bounding_boxes_df_new.to_csv("output/analysed_bounding_boxes_df_new_with_ids.csv")
|
1769 |
+
|
1770 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
1771 |
|
1772 |
return decision_process_table
|
|
|
1808 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
1809 |
annotations_all_pages: List[dict] = [], # List of annotations across all pages
|
1810 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1811 |
+
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
1812 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1813 |
pii_identification_method: str = "Local",
|
1814 |
comprehend_query_number:int = 0,
|
|
|
1989 |
pymupdf_page, page_image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_redaction_annotations_on_page, image_path, redact_whole_page=redact_whole_page, convert_pikepdf_to_pymupdf_coords=True, original_cropbox=original_cropboxes[page_no], page_sizes_df=page_sizes_df)
|
1990 |
|
1991 |
# Create decision process table
|
1992 |
+
page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
|
1993 |
|
1994 |
if not page_decision_process_table.empty:
|
1995 |
all_pages_decision_process_table_list.append(page_decision_process_table)
|
|
|
2057 |
|
2058 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2059 |
|
2060 |
+
# Write all page outputs
|
2061 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
2062 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
2063 |
|
tools/redaction_review.py
CHANGED
@@ -15,7 +15,7 @@ import pymupdf
|
|
15 |
from PIL import ImageDraw, Image
|
16 |
|
17 |
from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
|
18 |
-
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
|
19 |
from tools.helper_functions import get_file_name_without_type, detect_file_type
|
20 |
from tools.file_redaction import redact_page_with_pymupdf
|
21 |
|
@@ -99,6 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
|
|
99 |
review_dataframe = review_df
|
100 |
|
101 |
try:
|
|
|
|
|
102 |
review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
|
103 |
|
104 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
|
@@ -114,13 +116,13 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
|
|
114 |
page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
|
115 |
page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
|
116 |
|
117 |
-
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(
|
118 |
|
119 |
-
recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
|
120 |
|
121 |
except Exception as e:
|
122 |
print("Could not extract recogniser information:", e)
|
123 |
-
recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text"]]
|
124 |
|
125 |
label_choices = review_dataframe["label"].astype(str).unique().tolist()
|
126 |
text_choices = review_dataframe["text"].astype(str).unique().tolist()
|
@@ -151,7 +153,7 @@ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData,
|
|
151 |
|
152 |
review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
|
153 |
|
154 |
-
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(
|
155 |
|
156 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
|
157 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
@@ -179,15 +181,32 @@ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
|
|
179 |
'''
|
180 |
out_image_annotations_state = current_image_annotations_state
|
181 |
out_current_page_annotator = current_page_annotator
|
|
|
182 |
|
183 |
if not review_df.empty:
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
|
186 |
|
187 |
-
|
188 |
|
189 |
-
|
190 |
-
|
|
|
191 |
|
192 |
return out_current_page_annotator, out_image_annotations_state
|
193 |
|
@@ -206,24 +225,30 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
|
206 |
backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
207 |
|
208 |
if not selected_rows_df.empty and not review_df.empty:
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
211 |
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
|
|
|
|
|
|
216 |
out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
|
217 |
|
218 |
out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
|
219 |
|
220 |
-
out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
|
221 |
|
222 |
# Either there is nothing left in the selection dataframe, or the review dataframe
|
223 |
else:
|
224 |
out_review_df = review_df
|
225 |
out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
226 |
-
|
227 |
out_image_annotations_state = image_annotations_state
|
228 |
|
229 |
return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
@@ -234,7 +259,7 @@ def update_annotator_object_and_filter_df(
|
|
234 |
recogniser_entities_dropdown_value:str="ALL",
|
235 |
page_dropdown_value:str="ALL",
|
236 |
text_dropdown_value:str="ALL",
|
237 |
-
recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400),
|
238 |
zoom:int=100,
|
239 |
review_df:pd.DataFrame=[],
|
240 |
page_sizes:List[dict]=[],
|
@@ -244,6 +269,8 @@ def update_annotator_object_and_filter_df(
|
|
244 |
Update a gradio_image_annotation object with new annotation data.
|
245 |
'''
|
246 |
zoom_str = str(zoom) + '%'
|
|
|
|
|
247 |
|
248 |
if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
|
249 |
|
@@ -295,10 +322,7 @@ def update_annotator_object_and_filter_df(
|
|
295 |
|
296 |
replaced_image_path = current_image_path
|
297 |
|
298 |
-
if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"])
|
299 |
-
|
300 |
-
##
|
301 |
-
|
302 |
review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
|
303 |
|
304 |
# Update dropdowns and review selection dataframe with the updated annotator object
|
@@ -313,19 +337,27 @@ def update_annotator_object_and_filter_df(
|
|
313 |
images_list[page_num_reported_zero_indexed] = replaced_image_path
|
314 |
|
315 |
all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
|
316 |
-
|
317 |
# Multiply out image_annotation coordinates from relative to absolute if necessary
|
318 |
all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
|
319 |
|
320 |
all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
321 |
|
|
|
|
|
322 |
all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
|
323 |
|
|
|
|
|
|
|
|
|
324 |
# Remove blank duplicate entries
|
325 |
all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
|
326 |
|
327 |
current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
|
328 |
|
|
|
|
|
329 |
page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
|
330 |
|
331 |
###
|
@@ -537,7 +569,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
537 |
page_sizes_df = pd.DataFrame(page_sizes)
|
538 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
539 |
|
540 |
-
for i in progress.tqdm(range(0, number_of_pages), desc="Saving
|
541 |
|
542 |
image_loc = all_image_annotations[i]['image']
|
543 |
|
@@ -561,7 +593,9 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
561 |
pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
|
562 |
else:
|
563 |
print("File type not recognised.")
|
564 |
-
|
|
|
|
|
565 |
#try:
|
566 |
if pdf_doc:
|
567 |
out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
|
@@ -579,7 +613,14 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
579 |
|
580 |
try:
|
581 |
#print("Saving review file.")
|
582 |
-
review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
584 |
|
585 |
review_df.to_csv(out_review_file_file_path, index=None)
|
@@ -752,8 +793,9 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
752 |
row_value_page = evt.row_value[0] # This is the page number value
|
753 |
row_value_label = evt.row_value[1] # This is the label number value
|
754 |
row_value_text = evt.row_value[2] # This is the text number value
|
|
|
755 |
|
756 |
-
row_value_df = pd.DataFrame(data={"page":[row_value_page], "label":[row_value_label], "text":[row_value_text]})
|
757 |
|
758 |
return row_value_page, row_value_df
|
759 |
|
@@ -787,25 +829,61 @@ def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
|
|
787 |
|
788 |
return row_value_page, row_value_df
|
789 |
|
790 |
-
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame,
|
791 |
'''
|
792 |
Update the colour of a single redaction box based on the values in a selection row
|
793 |
'''
|
794 |
colour_tuple = str(tuple(colour))
|
795 |
|
796 |
-
if "color" not in review_df.columns: review_df["color"] =
|
|
|
|
|
797 |
|
798 |
# Reset existing highlight colours
|
799 |
-
review_df.loc[review_df["
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
800 |
|
801 |
-
|
802 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
803 |
|
804 |
review_df.drop("_merge", axis=1, inplace=True)
|
805 |
|
806 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
807 |
|
808 |
-
return review_df
|
809 |
|
810 |
def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
|
811 |
"""
|
|
|
15 |
from PIL import ImageDraw, Image
|
16 |
|
17 |
from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
|
18 |
+
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes
|
19 |
from tools.helper_functions import get_file_name_without_type, detect_file_type
|
20 |
from tools.file_redaction import redact_page_with_pymupdf
|
21 |
|
|
|
99 |
review_dataframe = review_df
|
100 |
|
101 |
try:
|
102 |
+
#print("converting annotation json in get_filtered_recogniser...")
|
103 |
+
|
104 |
review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
|
105 |
|
106 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
|
|
|
116 |
page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
|
117 |
page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
|
118 |
|
119 |
+
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text", "id"]], show_search="filter", col_count=(4, "fixed"), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
|
120 |
|
121 |
+
recogniser_dataframe_out = review_dataframe[["page", "label", "text", "id"]]
|
122 |
|
123 |
except Exception as e:
|
124 |
print("Could not extract recogniser information:", e)
|
125 |
+
recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text", "id"]]
|
126 |
|
127 |
label_choices = review_dataframe["label"].astype(str).unique().tolist()
|
128 |
text_choices = review_dataframe["text"].astype(str).unique().tolist()
|
|
|
153 |
|
154 |
review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
|
155 |
|
156 |
+
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text", "id"]], show_search="filter", col_count=(4, "fixed"), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
|
157 |
|
158 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
|
159 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
|
|
181 |
'''
|
182 |
out_image_annotations_state = current_image_annotations_state
|
183 |
out_current_page_annotator = current_page_annotator
|
184 |
+
gradio_annotator_current_page_number = current_page
|
185 |
|
186 |
if not review_df.empty:
|
187 |
+
#print("review_df just before convert_review_df:", review_df)
|
188 |
+
# First, check that the image on the current page is valid, replace with what exists in page_sizes object if not
|
189 |
+
if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
|
190 |
+
|
191 |
+
# Check bounding values for current page and page max
|
192 |
+
if gradio_annotator_current_page_number > 0: page_num_reported = gradio_annotator_current_page_number
|
193 |
+
elif gradio_annotator_current_page_number == 0: page_num_reported = 1 # minimum possible reported page is 1
|
194 |
+
else:
|
195 |
+
gradio_annotator_current_page_number = 0
|
196 |
+
page_num_reported = 1
|
197 |
+
|
198 |
+
# Ensure page displayed can't exceed number of pages in document
|
199 |
+
page_max_reported = len(out_image_annotations_state)
|
200 |
+
if page_num_reported > page_max_reported: page_num_reported = page_max_reported
|
201 |
+
|
202 |
+
page_num_reported_zero_indexed = page_num_reported - 1
|
203 |
out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
|
204 |
|
205 |
+
page_image_annotator_object, out_image_annotations_state = replace_images_in_image_annotation_object(out_image_annotations_state, out_image_annotations_state[page_num_reported_zero_indexed], page_sizes, page_num_reported)
|
206 |
|
207 |
+
out_image_annotations_state[page_num_reported_zero_indexed] = page_image_annotator_object
|
208 |
+
|
209 |
+
out_current_page_annotator = out_image_annotations_state[page_num_reported_zero_indexed]
|
210 |
|
211 |
return out_current_page_annotator, out_image_annotations_state
|
212 |
|
|
|
225 |
backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
226 |
|
227 |
if not selected_rows_df.empty and not review_df.empty:
|
228 |
+
use_id = (
|
229 |
+
"id" in selected_rows_df.columns
|
230 |
+
and "id" in review_df.columns
|
231 |
+
and not selected_rows_df["id"].isnull().all()
|
232 |
+
and not review_df["id"].isnull().all()
|
233 |
+
)
|
234 |
|
235 |
+
selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
|
236 |
+
|
237 |
+
# Subset and drop duplicates from selected_rows_df
|
238 |
+
selected_subset = selected_rows_df[selected_merge_cols].drop_duplicates(subset=selected_merge_cols)
|
239 |
+
|
240 |
+
# Perform anti-join using merge with indicator
|
241 |
+
merged_df = review_df.merge(selected_subset, on=selected_merge_cols, how='left', indicator=True)
|
242 |
out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
|
243 |
|
244 |
out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
|
245 |
|
246 |
+
out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text", "id"]]
|
247 |
|
248 |
# Either there is nothing left in the selection dataframe, or the review dataframe
|
249 |
else:
|
250 |
out_review_df = review_df
|
251 |
out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
|
|
252 |
out_image_annotations_state = image_annotations_state
|
253 |
|
254 |
return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
|
|
259 |
recogniser_entities_dropdown_value:str="ALL",
|
260 |
page_dropdown_value:str="ALL",
|
261 |
text_dropdown_value:str="ALL",
|
262 |
+
recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400, static_columns=[0,1,2,3]),
|
263 |
zoom:int=100,
|
264 |
review_df:pd.DataFrame=[],
|
265 |
page_sizes:List[dict]=[],
|
|
|
269 |
Update a gradio_image_annotation object with new annotation data.
|
270 |
'''
|
271 |
zoom_str = str(zoom) + '%'
|
272 |
+
|
273 |
+
#print("all_image_annotations at start of update_annotator_object_and_filter_df[-1]:", all_image_annotations[-1])
|
274 |
|
275 |
if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
|
276 |
|
|
|
322 |
|
323 |
replaced_image_path = current_image_path
|
324 |
|
325 |
+
if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text", "id"])
|
|
|
|
|
|
|
326 |
review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
|
327 |
|
328 |
# Update dropdowns and review selection dataframe with the updated annotator object
|
|
|
337 |
images_list[page_num_reported_zero_indexed] = replaced_image_path
|
338 |
|
339 |
all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
|
340 |
+
|
341 |
# Multiply out image_annotation coordinates from relative to absolute if necessary
|
342 |
all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
|
343 |
|
344 |
all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
345 |
|
346 |
+
#print("all_image_annotations_df[-1] just before creating annotation dicts:", all_image_annotations_df.iloc[-1, :])
|
347 |
+
|
348 |
all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
|
349 |
|
350 |
+
#print("all_image_annotations[-1] after creating annotation dicts:", all_image_annotations[-1])
|
351 |
+
|
352 |
+
|
353 |
+
|
354 |
# Remove blank duplicate entries
|
355 |
all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
|
356 |
|
357 |
current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
|
358 |
|
359 |
+
#print("current_page_image_annotator_object that goes into annotator object:", current_page_image_annotator_object)
|
360 |
+
|
361 |
page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
|
362 |
|
363 |
###
|
|
|
569 |
page_sizes_df = pd.DataFrame(page_sizes)
|
570 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
571 |
|
572 |
+
for i in progress.tqdm(range(0, number_of_pages), desc="Saving redacted pages to file", unit = "pages"):
|
573 |
|
574 |
image_loc = all_image_annotations[i]['image']
|
575 |
|
|
|
593 |
pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
|
594 |
else:
|
595 |
print("File type not recognised.")
|
596 |
+
|
597 |
+
progress(0.9, "Saving output files")
|
598 |
+
|
599 |
#try:
|
600 |
if pdf_doc:
|
601 |
out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
|
|
|
613 |
|
614 |
try:
|
615 |
#print("Saving review file.")
|
616 |
+
review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)
|
617 |
+
|
618 |
+
page_sizes_df = pd.DataFrame(page_sizes)
|
619 |
+
page_sizes_df .loc[:, "page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
|
620 |
+
review_df = divide_coordinates_by_page_sizes(review_df, page_sizes_df)
|
621 |
+
|
622 |
+
review_df = review_df[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text", "id"]]
|
623 |
+
|
624 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
625 |
|
626 |
review_df.to_csv(out_review_file_file_path, index=None)
|
|
|
793 |
row_value_page = evt.row_value[0] # This is the page number value
|
794 |
row_value_label = evt.row_value[1] # This is the label number value
|
795 |
row_value_text = evt.row_value[2] # This is the text number value
|
796 |
+
row_value_id = evt.row_value[3] # This is the text number value
|
797 |
|
798 |
+
row_value_df = pd.DataFrame(data={"page":[row_value_page], "label":[row_value_label], "text":[row_value_text], "id":[row_value_id]})
|
799 |
|
800 |
return row_value_page, row_value_df
|
801 |
|
|
|
829 |
|
830 |
return row_value_page, row_value_df
|
831 |
|
832 |
+
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, previous_id:str="", previous_colour:str='(0, 0, 0)', page_sizes:List[dict]=[], output_folder:str=OUTPUT_FOLDER, colour:str='(1, 0, 255)'):
|
833 |
'''
|
834 |
Update the colour of a single redaction box based on the values in a selection row
|
835 |
'''
|
836 |
colour_tuple = str(tuple(colour))
|
837 |
|
838 |
+
if "color" not in review_df.columns: review_df["color"] = '(0, 0, 0)'
|
839 |
+
if "id" not in review_df.columns:
|
840 |
+
review_df = fill_missing_ids(review_df)
|
841 |
|
842 |
# Reset existing highlight colours
|
843 |
+
review_df.loc[review_df["id"]==previous_id, "color"] = review_df.loc[review_df["id"]==previous_id, "color"].apply(lambda _: previous_colour)
|
844 |
+
review_df.loc[review_df["color"].astype(str)==colour, "color"] = review_df.loc[review_df["color"].astype(str)==colour, "color"].apply(lambda _: '(0, 0, 0)')
|
845 |
+
|
846 |
+
if not redaction_row_selection.empty and not review_df.empty:
|
847 |
+
use_id = (
|
848 |
+
"id" in redaction_row_selection.columns
|
849 |
+
and "id" in review_df.columns
|
850 |
+
and not redaction_row_selection["id"].isnull().all()
|
851 |
+
and not review_df["id"].isnull().all()
|
852 |
+
)
|
853 |
+
|
854 |
+
selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
|
855 |
+
|
856 |
+
review_df = review_df.merge(redaction_row_selection[selected_merge_cols], on=selected_merge_cols, indicator=True, how="left")
|
857 |
|
858 |
+
if "_merge" in review_df.columns:
|
859 |
+
filtered_reviews = review_df.loc[review_df["_merge"]=="both"]
|
860 |
+
else:
|
861 |
+
filtered_reviews = pd.DataFrame()
|
862 |
+
|
863 |
+
if not filtered_reviews.empty:
|
864 |
+
previous_colour = str(filtered_reviews["color"].values[0])
|
865 |
+
previous_id = filtered_reviews["id"].values[0]
|
866 |
+
review_df.loc[review_df["_merge"]=="both", "color"] = review_df.loc[review_df["_merge"] == "both", "color"].apply(lambda _: colour)
|
867 |
+
else:
|
868 |
+
# Handle the case where no rows match the condition
|
869 |
+
print("No reviews found with _merge == 'both'")
|
870 |
+
previous_colour = '(0, 0, 0)'
|
871 |
+
review_df.loc[review_df["color"]==colour, "color"] = previous_colour
|
872 |
+
previous_id =''
|
873 |
|
874 |
review_df.drop("_merge", axis=1, inplace=True)
|
875 |
|
876 |
+
# Ensure that all output coordinates are in proportional size
|
877 |
+
#page_sizes_df = pd.DataFrame(page_sizes)
|
878 |
+
#page_sizes_df .loc[:, "page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
|
879 |
+
#print("review_df before divide:", review_df)
|
880 |
+
#print("page_sizes_df before divide:", page_sizes_df)
|
881 |
+
#review_df = divide_coordinates_by_page_sizes(review_df, page_sizes_df)
|
882 |
+
#print("review_df after divide:", review_df)
|
883 |
+
|
884 |
+
review_df = review_df[["image", "page", "label", "color", "xmin","ymin", "xmax", "ymax", "text", "id"]]
|
885 |
|
886 |
+
return review_df, previous_id, previous_colour
|
887 |
|
888 |
def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
|
889 |
"""
|
tools/textract_batch_call.py
CHANGED
@@ -164,7 +164,7 @@ def analyse_document_with_textract_api(
|
|
164 |
}])
|
165 |
|
166 |
# File path
|
167 |
-
log_file_path = os.path.join(local_output_dir, "
|
168 |
|
169 |
# Check if file exists
|
170 |
file_exists = os.path.exists(log_file_path)
|
@@ -444,18 +444,16 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
|
|
444 |
'''
|
445 |
Load in a dataframe of jobs previous submitted to the Textract API service.
|
446 |
'''
|
447 |
-
|
448 |
job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
|
449 |
|
450 |
# Initialize boto3 clients
|
451 |
session = boto3.Session(region_name=aws_region)
|
452 |
s3_client = session.client('s3')
|
453 |
|
454 |
-
local_output_path = f'{load_local_jobs_loc}/
|
455 |
|
456 |
if load_s3_jobs == 'True':
|
457 |
-
|
458 |
-
s3_output_key = f'{load_s3_jobs_loc}/textract_job_log_files.csv'
|
459 |
|
460 |
try:
|
461 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
@@ -523,4 +521,10 @@ def download_textract_output(job_id:str,
|
|
523 |
s3_client.download_file(output_bucket, output_file_key, local_file_path)
|
524 |
print(f"Output file downloaded to: {local_file_path}")
|
525 |
except Exception as e:
|
526 |
-
print(f"Error downloading file: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
}])
|
165 |
|
166 |
# File path
|
167 |
+
log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
|
168 |
|
169 |
# Check if file exists
|
170 |
file_exists = os.path.exists(log_file_path)
|
|
|
444 |
'''
|
445 |
Load in a dataframe of jobs previous submitted to the Textract API service.
|
446 |
'''
|
|
|
447 |
job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
|
448 |
|
449 |
# Initialize boto3 clients
|
450 |
session = boto3.Session(region_name=aws_region)
|
451 |
s3_client = session.client('s3')
|
452 |
|
453 |
+
local_output_path = f'{load_local_jobs_loc}/textract_document_jobs.csv'
|
454 |
|
455 |
if load_s3_jobs == 'True':
|
456 |
+
s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
|
|
|
457 |
|
458 |
try:
|
459 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
|
|
521 |
s3_client.download_file(output_bucket, output_file_key, local_file_path)
|
522 |
print(f"Output file downloaded to: {local_file_path}")
|
523 |
except Exception as e:
|
524 |
+
print(f"Error downloading file: {e}")
|
525 |
+
|
526 |
+
def check_textract_outputs_exist(textract_output_found_checkbox):
|
527 |
+
if textract_output_found_checkbox == True:
|
528 |
+
print("Textract outputs found")
|
529 |
+
return
|
530 |
+
else: raise Exception("Relevant Tetract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")
|