Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

Sean Pedrick-Case committed on Aug 27

Commit

bcf1a65

unverified ·

2 Parent(s): 58b064b 80268bb

Merge pull request #56 from seanpedrick-case/review_updates

Browse files

Updated review functions to update with manual reviews. Minor package update

Files changed (5) hide show

app.py +30 -15
pyproject.toml +1 -1
requirements.txt +1 -1
tools/find_duplicate_pages.py +4 -4
tools/redaction_review.py +58 -25

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from tools.helper_functions import put_columns_in_df, get_connection_params, rev
 from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
-from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact
 from tools.data_anonymise import anonymise_files_with_open_text
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
@@ -756,7 +756,7 @@ with app:
     success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
     success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df], scroll_to_output=True)
-    # Save current page redactions
     update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
     success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
     success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df])
@@ -772,6 +772,8 @@ with app:
     # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
     recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
         success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
         success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
         success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
@@ -779,22 +781,28 @@ with app:
     reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
-    # Exclude current selection from annotator and outputs
     # Exclude only selected row
-    exclude_selected_row_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
     # Exclude all items with same text as selected row
-    exclude_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
-    success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
     # Exclude everything visible in table
-    exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
@@ -837,6 +845,8 @@ with app:
     # Clicking on a cell in the redact items table will take you to that page
     all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
         success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
         success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
@@ -844,27 +854,32 @@ with app:
     reset_dropdowns_btn_new.click(reset_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
     # Reset redaction table following filtering
     reset_ocr_with_words_df_btn.click(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
     # Redact current selection
-    redact_selected_row_btn.click(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[selected_entity_dataframe_row_redact,  all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
     # Redact all items with same text as selected row
-    redact_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text_redact, inputs=[all_page_line_level_ocr_results_with_words_df_base, selected_entity_dataframe_row_text_redact], outputs=[to_redact_dataframe_same_text]).\
-    success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[to_redact_dataframe_same_text,  all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state,recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
-    # Redact everything visible in table
-    redact_selected_btn.click(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[all_page_line_level_ocr_results_with_words_df,  all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
-        success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
-        success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
-        success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
     # Undo last redaction action
     undo_last_redact_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\

 from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
+from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact,get_and_merge_current_page_annotations
 from tools.data_anonymise import anonymise_files_with_open_text
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
     success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
     success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df], scroll_to_output=True)
+    # Save current page manual redactions
     update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
     success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
     success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df])
     # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
     recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
+        success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
         success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
         success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
         success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
     reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
+    ### Exclude current selection from annotator and outputs
     # Exclude only selected row
+    exclude_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
+        success(exclude_selected_items_from_redaction, inputs=[review_file_df, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
     # Exclude all items with same text as selected row
+    exclude_text_with_same_as_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
+        success(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
+        success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
     # Exclude everything visible in table
+    exclude_selected_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
+        success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
     # Clicking on a cell in the redact items table will take you to that page
     all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
+        success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
         success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
         success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
     reset_dropdowns_btn_new.click(reset_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
+    # Redact everything visible in table
+    redact_selected_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[all_page_line_level_ocr_results_with_words_df,  all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
+        success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
+        success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
+        success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
     # Reset redaction table following filtering
     reset_ocr_with_words_df_btn.click(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
     # Redact current selection
+    redact_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[selected_entity_dataframe_row_redact,  all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
     # Redact all items with same text as selected row
+    redact_text_with_same_as_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+    success(get_all_rows_with_same_text_redact, inputs=[all_page_line_level_ocr_results_with_words_df_base, selected_entity_dataframe_row_text_redact], outputs=[to_redact_dataframe_same_text]).\
+    success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[to_redact_dataframe_same_text,  all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state,recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
         success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
         success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
     # Undo last redaction action
     undo_last_redact_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\

pyproject.toml CHANGED Viewed

@@ -23,7 +23,7 @@ dependencies = [
     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.43.1",
     "boto3==1.40.10",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",

     "spacy==3.8.7",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
+    "gradio==5.44.0",
     "boto3==1.40.10",
     "pyarrow==21.0.0",
     "openpyxl==3.1.5",

requirements.txt CHANGED Viewed

@@ -10,7 +10,7 @@ pandas==2.3.1
 scikit-learn==1.7.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.43.1
 boto3==1.40.10
 pyarrow==21.0.0
 openpyxl==3.1.5

 scikit-learn==1.7.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==5.44.0
 boto3==1.40.10
 pyarrow==21.0.0
 openpyxl==3.1.5

tools/find_duplicate_pages.py CHANGED Viewed

@@ -570,7 +570,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
     final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
     output_paths.append(str(similarity_file_output_path))
-    print(f"Main results saved to {similarity_file_output_path}")
     # 2. Save per-file redaction lists
     # Use 'Page2_File' as the source of duplicate content
@@ -663,7 +663,7 @@ def find_consecutive_sequence_matches(
         A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
         consecutive match, or an empty DataFrame if no match is found.
     """
-    print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
     # Step 1: Isolate the data for each file
     search_df = df_filtered[df_filtered['file'] == search_file_name]
@@ -693,7 +693,7 @@ def find_consecutive_sequence_matches(
         # Step 4: If the window matches the query with or without punctuation on end
         if _sequences_match(query_tokens, window):
-            print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
             # Get the global indices for this entire matching block
             matching_reference_indices = reference_indices[i : i + query_len]
@@ -795,7 +795,7 @@ def identify_similar_text_sequences(
     progress(0.7, desc="Aggregating results based on matching strategy")
     if greedy_match or min_consecutive_pages > 1:
-        print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
         # Sort the dataframe to ensure consecutive pages are adjacent
         similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()

     final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
     output_paths.append(str(similarity_file_output_path))
+    #print(f"Main results saved to {similarity_file_output_path}")
     # 2. Save per-file redaction lists
     # Use 'Page2_File' as the source of duplicate content
         A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
         consecutive match, or an empty DataFrame if no match is found.
     """
+    #print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
     # Step 1: Isolate the data for each file
     search_df = df_filtered[df_filtered['file'] == search_file_name]
         # Step 4: If the window matches the query with or without punctuation on end
         if _sequences_match(query_tokens, window):
+            #print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
             # Get the global indices for this entire matching block
             matching_reference_indices = reference_indices[i : i + query_len]
     progress(0.7, desc="Aggregating results based on matching strategy")
     if greedy_match or min_consecutive_pages > 1:
+        #print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
         # Sort the dataframe to ensure consecutive pages are adjacent
         similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()

tools/redaction_review.py CHANGED Viewed

@@ -466,6 +466,33 @@ def _merge_horizontally_adjacent_boxes(
     return merged_df
 def create_annotation_objects_from_filtered_ocr_results_with_words(
     filtered_ocr_results_with_words_df: pd.DataFrame,
     ocr_results_with_words_df_base: pd.DataFrame,
@@ -475,6 +502,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     existing_recogniser_entity_df: pd.DataFrame,
     redaction_label:str = "Redaction",
     colour_label:str = '(0, 0, 0)',
     progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
@@ -493,8 +521,17 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     """
     # Validate colour_label: must be a 3-number tuple with each value in [0, 255]
-    # If invalid, fallback to '(0, 0, 0,)' as requested
-    fallback_colour = '(0, 0, 0,)'
     try:
         valid = False
         if isinstance(colour_label, str):
@@ -507,7 +544,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
         elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
             r_val, g_val, b_val = colour_label
             if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
-                colour_label = f'({r_val}, {g_val}, {b_val},)'
                 valid = True
         if not valid:
             colour_label = fallback_colour
@@ -573,7 +610,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
                     indicator=True
                 )
                 unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
-                #unique_new_df = new_annotations_df
             print(f"Found {len(unique_new_df)} new unique annotations to add.")
             gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
@@ -606,8 +642,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
     # 3. Sort the DataFrame based on this new custom order.
-    merged_df = merged_df.sort_values('image')
     final_annotations_list = list()
     box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
@@ -616,11 +651,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     # DataFrame's current order, which we have just manually set. This is slightly
     # more efficient than letting it sort again.
     for image_path, group in merged_df.groupby('image', sort=False, observed=False):
-        # The progress.tqdm wrapper can be added back around the groupby object as you had it.
-        # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):
-        # Check if the group has actual annotations. iloc[0] is safe because even pages
-        # without annotations will have one row with NaN values from the merge.
         if pd.isna(group.iloc[0].get('id')):
             boxes = list()
         else:
@@ -636,6 +668,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
     progress(1.0, desc="Completed annotation processing")
     return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
 def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
@@ -844,7 +878,6 @@ def update_annotator_object_and_filter_df(
                      review_df['page'] = pd.to_numeric(review_df['page'], errors='coerce').fillna(-1).astype(int)
                      review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
             except Exception as e:
                  print(f"Error during image path replacement for page {page_num_reported}: {e}")
     else:
@@ -857,7 +890,7 @@ def update_annotator_object_and_filter_df(
     else:
         page_sizes = list() # Ensure page_sizes is a list if df is empty
-    # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
     current_page_image_annotator_object = None
     if len(all_image_annotations) > page_num_reported_zero_indexed:
         page_data_for_display = all_image_annotations[page_num_reported_zero_indexed]
@@ -984,11 +1017,11 @@ def update_all_page_annotation_object_based_on_previous_page(
     Overwrite image annotations on the page we are moving from with modifications.
     '''
-    if current_page > len(page_sizes):
-        raise Warning("Selected page is higher than last page number")
-    elif current_page <= 0:
-        raise Warning("Selected page is lower than first page")
     previous_page_zero_index = previous_page -1
@@ -1000,6 +1033,8 @@ def update_all_page_annotation_object_based_on_previous_page(
     if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
     else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
     return all_image_annotations, current_page, current_page
 def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
@@ -1179,15 +1214,9 @@ def update_all_entity_df_dropdowns(df:pd.DataFrame, label_dropdown_value:str, pa
     filtered_df = df.copy()
-    # Apply filtering based on dropdown selections
-    # if not "ALL" in page_dropdown_value:
-    #     filtered_df = filtered_df[filtered_df["page"].astype(str).isin(page_dropdown_value)]
-    # if not "ALL" in text_dropdown_value:
-    #     filtered_df = filtered_df[filtered_df["text"].astype(str).isin(text_dropdown_value)]
-    # if not "ALL" in label_dropdown_value:
-    #     filtered_df = filtered_df[filtered_df["label"].astype(str).isin(label_dropdown_value)]
     recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
     recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
@@ -1224,6 +1253,10 @@ def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dro
     if not "ALL" in choice:
         filtered_df = filtered_df[filtered_df["label"].astype(str).isin(choice)]
     recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
     recogniser_entities_drop = gr.Dropdown(value=choice[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

     return merged_df
+def get_and_merge_current_page_annotations(
+    page_sizes: List[Dict],
+    annotate_current_page: int,
+    existing_annotations_list: List[Dict],
+    existing_annotations_df: pd.DataFrame
+) -> pd.DataFrame:
+    """
+    Fold the annotations belonging to the current page into the main
+    annotations DataFrame.
+
+    Args:
+        page_sizes: Per-page dicts; the entry at index
+            `annotate_current_page - 1` is read for its "image_path" value.
+        annotate_current_page: Page number of the page being annotated
+            (one greater than its index in `page_sizes`).
+        existing_annotations_list: Annotation dicts; only those whose "image"
+            value equals the current page's image path are used.
+        existing_annotations_df: DataFrame the current page's annotations are
+            appended to. It is sorted on "page", "xmin" and "ymin" and
+            de-duplicated on "id", so those columns must be present after
+            concatenation.
+
+    Returns:
+        The concatenated DataFrame, sorted by page, xmin and ymin, holding a
+        single row per "id" value.
+    """
+    # Resolve which image file represents the current page.
+    page_entry = page_sizes[annotate_current_page - 1]
+    target_image_path = page_entry["image_path"]
+
+    # Keep only the annotation entries attached to that image and turn them
+    # into rows.
+    page_annotations = [
+        entry for entry in existing_annotations_list if entry["image"] == target_image_path
+    ]
+    page_annotations_df = convert_annotation_data_to_dataframe(page_annotations)
+
+    # Append the page rows after the existing ones and put them in reading order.
+    combined_df = pd.concat([existing_annotations_df, page_annotations_df], ignore_index=True)
+    ordered_df = combined_df.sort_values(by=["page", "xmin", "ymin"])
+
+    # NOTE(review): duplicates are resolved after sorting, so for a repeated id
+    # the surviving row is the first in sorted order rather than the row from a
+    # particular input - confirm this precedence is intended.
+    return ordered_df.drop_duplicates(subset=["id"], keep="first")
 def create_annotation_objects_from_filtered_ocr_results_with_words(
     filtered_ocr_results_with_words_df: pd.DataFrame,
     ocr_results_with_words_df_base: pd.DataFrame,
     existing_recogniser_entity_df: pd.DataFrame,
     redaction_label:str = "Redaction",
     colour_label:str = '(0, 0, 0)',
+    annotate_current_page:int = 1,
     progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     """
     This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
     """
     # Validate colour_label: must be a 3-number tuple with each value in [0, 255]
+    # If invalid, fallback to '(0, 0, 0)' as requested
+    fallback_colour = '(0, 0, 0)'
+    existing_annotations_df = get_and_merge_current_page_annotations(
+        page_sizes,
+        annotate_current_page,
+        existing_annotations_list,
+        existing_annotations_df
+    )
     try:
         valid = False
         if isinstance(colour_label, str):
         elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
             r_val, g_val, b_val = colour_label
             if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
+                colour_label = f'({r_val}, {g_val}, {b_val})'
                 valid = True
         if not valid:
             colour_label = fallback_colour
                     indicator=True
                 )
                 unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
             print(f"Found {len(unique_new_df)} new unique annotations to add.")
             gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
     merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
     # 3. Sort the DataFrame based on this new custom order.
+    merged_df = merged_df.sort_values('image')
     final_annotations_list = list()
     box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
     # DataFrame's current order, which we have just manually set. This is slightly
     # more efficient than letting it sort again.
     for image_path, group in merged_df.groupby('image', sort=False, observed=False):
+        # Check if the group has actual annotations.
         if pd.isna(group.iloc[0].get('id')):
             boxes = list()
         else:
     progress(1.0, desc="Completed annotation processing")
+    print("final_annotations_list:", final_annotations_list)
     return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
 def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
                      review_df['page'] = pd.to_numeric(review_df['page'], errors='coerce').fillna(-1).astype(int)
                      review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
             except Exception as e:
                  print(f"Error during image path replacement for page {page_num_reported}: {e}")
     else:
     else:
         page_sizes = list() # Ensure page_sizes is a list if df is empty
+    # --- Prepare data *only* for the current page for display ---
     current_page_image_annotator_object = None
     if len(all_image_annotations) > page_num_reported_zero_indexed:
         page_data_for_display = all_image_annotations[page_num_reported_zero_indexed]
     Overwrite image annotations on the page we are moving from with modifications.
     '''
+    if current_page > len(page_sizes): raise Warning("Selected page is higher than last page number")
+    elif current_page <= 0: raise Warning("Selected page is lower than first page")
+    #print("all_image_annotations:", all_image_annotations)
+    #print("page_image_annotator_object:", page_image_annotator_object)
     previous_page_zero_index = previous_page -1
     if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
     else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
+    #print("all_image_annotations:", all_image_annotations)
     return all_image_annotations, current_page, current_page
 def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
     filtered_df = df.copy()
+    if not label_dropdown_value[0]: label_dropdown_value[0] = "ALL"
+    if not text_dropdown_value[0]: text_dropdown_value[0] = "ALL"
+    if not page_dropdown_value[0]: page_dropdown_value[0] = "1"
     recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
     recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
     if not "ALL" in choice:
         filtered_df = filtered_df[filtered_df["label"].astype(str).isin(choice)]
+    if not choice[0]: choice[0] = "ALL"
+    if not text_dropdown_value[0]: text_dropdown_value[0] = "ALL"
+    if not page_dropdown_value[0]: page_dropdown_value[0] = "1"
     recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
     recogniser_entities_drop = gr.Dropdown(value=choice[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)