Sean Pedrick-Case committed on
Commit
bcf1a65
·
unverified ·
2 Parent(s): 58b064b 80268bb

Merge pull request #56 from seanpedrick-case/review_updates

Browse files

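Updated review functions so that manual redactions on the current page are saved and merged into the review data before row-selection and exclude actions run. Minor package update.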

app.py CHANGED
@@ -7,7 +7,7 @@ from tools.helper_functions import put_columns_in_df, get_connection_params, rev
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact
11
  from tools.data_anonymise import anonymise_files_with_open_text
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
@@ -756,7 +756,7 @@ with app:
756
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
757
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df], scroll_to_output=True)
758
 
759
- # Save current page redactions
760
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
761
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
762
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df])
@@ -772,6 +772,8 @@ with app:
772
 
773
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
774
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
 
 
775
  success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
776
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
777
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
@@ -779,22 +781,28 @@ with app:
779
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
780
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
781
 
782
- # Exclude current selection from annotator and outputs
783
  # Exclude only selected row
784
- exclude_selected_row_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
 
 
785
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
786
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
787
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
788
 
789
  # Exclude all items with same text as selected row
790
- exclude_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
791
- success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
 
 
792
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
793
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
794
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
795
 
796
  # Exclude everything visible in table
797
- exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
 
 
798
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
799
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
800
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
@@ -837,6 +845,8 @@ with app:
837
 
838
  # Clicking on a cell in the redact items table will take you to that page
839
  all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
 
 
840
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
841
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
842
 
@@ -844,27 +854,32 @@ with app:
844
  reset_dropdowns_btn_new.click(reset_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction]).\
845
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
846
 
 
 
 
 
 
 
 
847
  # Reset redaction table following filtering
848
  reset_ocr_with_words_df_btn.click(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
849
 
850
  # Redact current selection
851
- redact_selected_row_btn.click(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[selected_entity_dataframe_row_redact, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
 
852
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
853
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
854
  success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
855
 
856
  # Redact all items with same text as selected row
857
- redact_text_with_same_as_selected_row_btn.click(get_all_rows_with_same_text_redact, inputs=[all_page_line_level_ocr_results_with_words_df_base, selected_entity_dataframe_row_text_redact], outputs=[to_redact_dataframe_same_text]).\
858
- success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[to_redact_dataframe_same_text, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state,recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
 
859
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
860
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
861
  success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
862
 
863
- # Redact everything visible in table
864
- redact_selected_btn.click(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[all_page_line_level_ocr_results_with_words_df, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
865
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
866
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
867
- success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
868
 
869
  # Undo last redaction action
870
  undo_last_redact_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
 
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact,get_and_merge_current_page_annotations
11
  from tools.data_anonymise import anonymise_files_with_open_text
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
 
756
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
757
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df], scroll_to_output=True)
758
 
759
+ # Save current page manual redactions
760
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
761
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
762
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df])
 
772
 
773
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
774
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
775
+ success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
776
+ success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
777
  success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
778
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
779
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
 
781
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
782
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
783
 
784
+ ### Exclude current selection from annotator and outputs
785
  # Exclude only selected row
786
+ exclude_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
787
+ success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
788
+ success(exclude_selected_items_from_redaction, inputs=[review_file_df, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
789
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
790
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
791
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
792
 
793
  # Exclude all items with same text as selected row
794
+ exclude_text_with_same_as_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
795
+ success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
796
+ success(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
797
+ success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
798
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
799
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
800
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
801
 
802
  # Exclude everything visible in table
803
+ exclude_selected_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
804
+ success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
805
+ success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
806
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
807
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
808
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
 
845
 
846
  # Clicking on a cell in the redact items table will take you to that page
847
  all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
848
+ success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
849
+ success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
850
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
851
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
852
 
 
854
  reset_dropdowns_btn_new.click(reset_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction]).\
855
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
856
 
857
+ # Redact everything visible in table
858
+ redact_selected_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
859
+ success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[all_page_line_level_ocr_results_with_words_df, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
860
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
861
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
862
+ success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
863
+
864
  # Reset redaction table following filtering
865
  reset_ocr_with_words_df_btn.click(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
866
 
867
  # Redact current selection
868
+ redact_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
869
+ success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[selected_entity_dataframe_row_redact, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
870
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
871
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
872
  success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
873
 
874
  # Redact all items with same text as selected row
875
+ redact_text_with_same_as_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
876
+ success(get_all_rows_with_same_text_redact, inputs=[all_page_line_level_ocr_results_with_words_df_base, selected_entity_dataframe_row_text_redact], outputs=[to_redact_dataframe_same_text]).\
877
+ success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[to_redact_dataframe_same_text, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state,recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
878
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
879
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
880
  success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
881
 
882
+
 
 
 
 
883
 
884
  # Undo last redaction action
885
  undo_last_redact_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
pyproject.toml CHANGED
@@ -23,7 +23,7 @@ dependencies = [
23
  "spacy==3.8.7",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.43.1",
27
  "boto3==1.40.10",
28
  "pyarrow==21.0.0",
29
  "openpyxl==3.1.5",
 
23
  "spacy==3.8.7",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.44.0",
27
  "boto3==1.40.10",
28
  "pyarrow==21.0.0",
29
  "openpyxl==3.1.5",
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==2.3.1
10
  scikit-learn==1.7.1
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.43.1
14
  boto3==1.40.10
15
  pyarrow==21.0.0
16
  openpyxl==3.1.5
 
10
  scikit-learn==1.7.1
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.44.0
14
  boto3==1.40.10
15
  pyarrow==21.0.0
16
  openpyxl==3.1.5
tools/find_duplicate_pages.py CHANGED
@@ -570,7 +570,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
570
  final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
571
 
572
  output_paths.append(str(similarity_file_output_path))
573
- print(f"Main results saved to {similarity_file_output_path}")
574
 
575
  # 2. Save per-file redaction lists
576
  # Use 'Page2_File' as the source of duplicate content
@@ -663,7 +663,7 @@ def find_consecutive_sequence_matches(
663
  A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
664
  consecutive match, or an empty DataFrame if no match is found.
665
  """
666
- print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
667
 
668
  # Step 1: Isolate the data for each file
669
  search_df = df_filtered[df_filtered['file'] == search_file_name]
@@ -693,7 +693,7 @@ def find_consecutive_sequence_matches(
693
 
694
  # Step 4: If the window matches the query with or without punctuation on end
695
  if _sequences_match(query_tokens, window):
696
- print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
697
 
698
  # Get the global indices for this entire matching block
699
  matching_reference_indices = reference_indices[i : i + query_len]
@@ -795,7 +795,7 @@ def identify_similar_text_sequences(
795
  progress(0.7, desc="Aggregating results based on matching strategy")
796
 
797
  if greedy_match or min_consecutive_pages > 1:
798
- print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
799
 
800
  # Sort the dataframe to ensure consecutive pages are adjacent
801
  similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()
 
570
  final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
571
 
572
  output_paths.append(str(similarity_file_output_path))
573
+ #print(f"Main results saved to {similarity_file_output_path}")
574
 
575
  # 2. Save per-file redaction lists
576
  # Use 'Page2_File' as the source of duplicate content
 
663
  A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
664
  consecutive match, or an empty DataFrame if no match is found.
665
  """
666
+ #print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
667
 
668
  # Step 1: Isolate the data for each file
669
  search_df = df_filtered[df_filtered['file'] == search_file_name]
 
693
 
694
  # Step 4: If the window matches the query with or without punctuation on end
695
  if _sequences_match(query_tokens, window):
696
+ #print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
697
 
698
  # Get the global indices for this entire matching block
699
  matching_reference_indices = reference_indices[i : i + query_len]
 
795
  progress(0.7, desc="Aggregating results based on matching strategy")
796
 
797
  if greedy_match or min_consecutive_pages > 1:
798
+ #print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
799
 
800
  # Sort the dataframe to ensure consecutive pages are adjacent
801
  similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()
tools/redaction_review.py CHANGED
@@ -466,6 +466,33 @@ def _merge_horizontally_adjacent_boxes(
466
 
467
  return merged_df
468
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
  def create_annotation_objects_from_filtered_ocr_results_with_words(
470
  filtered_ocr_results_with_words_df: pd.DataFrame,
471
  ocr_results_with_words_df_base: pd.DataFrame,
@@ -475,6 +502,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
475
  existing_recogniser_entity_df: pd.DataFrame,
476
  redaction_label:str = "Redaction",
477
  colour_label:str = '(0, 0, 0)',
 
478
  progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
479
  """
480
  This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
@@ -493,8 +521,17 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
493
  """
494
 
495
  # Validate colour_label: must be a 3-number tuple with each value in [0, 255]
496
- # If invalid, fallback to '(0, 0, 0,)' as requested
497
- fallback_colour = '(0, 0, 0,)'
 
 
 
 
 
 
 
 
 
498
  try:
499
  valid = False
500
  if isinstance(colour_label, str):
@@ -507,7 +544,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
507
  elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
508
  r_val, g_val, b_val = colour_label
509
  if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
510
- colour_label = f'({r_val}, {g_val}, {b_val},)'
511
  valid = True
512
  if not valid:
513
  colour_label = fallback_colour
@@ -573,7 +610,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
573
  indicator=True
574
  )
575
  unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
576
- #unique_new_df = new_annotations_df
577
 
578
  print(f"Found {len(unique_new_df)} new unique annotations to add.")
579
  gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
@@ -606,8 +642,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
606
  merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
607
 
608
  # 3. Sort the DataFrame based on this new custom order.
609
- merged_df = merged_df.sort_values('image')
610
-
611
 
612
  final_annotations_list = list()
613
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
@@ -616,11 +651,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
616
  # DataFrame's current order, which we have just manually set. This is slightly
617
  # more efficient than letting it sort again.
618
  for image_path, group in merged_df.groupby('image', sort=False, observed=False):
619
- # The progress.tqdm wrapper can be added back around the groupby object as you had it.
620
- # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):
621
 
622
- # Check if the group has actual annotations. iloc[0] is safe because even pages
623
- # without annotations will have one row with NaN values from the merge.
624
  if pd.isna(group.iloc[0].get('id')):
625
  boxes = list()
626
  else:
@@ -636,6 +668,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
636
 
637
  progress(1.0, desc="Completed annotation processing")
638
 
 
 
639
  return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
640
 
641
  def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
@@ -844,7 +878,6 @@ def update_annotator_object_and_filter_df(
844
  review_df['page'] = pd.to_numeric(review_df['page'], errors='coerce').fillna(-1).astype(int)
845
  review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
846
 
847
-
848
  except Exception as e:
849
  print(f"Error during image path replacement for page {page_num_reported}: {e}")
850
  else:
@@ -857,7 +890,7 @@ def update_annotator_object_and_filter_df(
857
  else:
858
  page_sizes = list() # Ensure page_sizes is a list if df is empty
859
 
860
- # --- OPTIMIZATION: Prepare data *only* for the current page for display ---
861
  current_page_image_annotator_object = None
862
  if len(all_image_annotations) > page_num_reported_zero_indexed:
863
  page_data_for_display = all_image_annotations[page_num_reported_zero_indexed]
@@ -984,11 +1017,11 @@ def update_all_page_annotation_object_based_on_previous_page(
984
  Overwrite image annotations on the page we are moving from with modifications.
985
  '''
986
 
987
- if current_page > len(page_sizes):
988
- raise Warning("Selected page is higher than last page number")
989
- elif current_page <= 0:
990
- raise Warning("Selected page is lower than first page")
991
 
 
 
992
 
993
  previous_page_zero_index = previous_page -1
994
 
@@ -1000,6 +1033,8 @@ def update_all_page_annotation_object_based_on_previous_page(
1000
  if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
1001
  else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
1002
 
 
 
1003
  return all_image_annotations, current_page, current_page
1004
 
1005
  def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
@@ -1179,15 +1214,9 @@ def update_all_entity_df_dropdowns(df:pd.DataFrame, label_dropdown_value:str, pa
1179
 
1180
  filtered_df = df.copy()
1181
 
1182
- # Apply filtering based on dropdown selections
1183
- # if not "ALL" in page_dropdown_value:
1184
- # filtered_df = filtered_df[filtered_df["page"].astype(str).isin(page_dropdown_value)]
1185
-
1186
- # if not "ALL" in text_dropdown_value:
1187
- # filtered_df = filtered_df[filtered_df["text"].astype(str).isin(text_dropdown_value)]
1188
-
1189
- # if not "ALL" in label_dropdown_value:
1190
- # filtered_df = filtered_df[filtered_df["label"].astype(str).isin(label_dropdown_value)]
1191
 
1192
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
1193
  recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
@@ -1224,6 +1253,10 @@ def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dro
1224
  if not "ALL" in choice:
1225
  filtered_df = filtered_df[filtered_df["label"].astype(str).isin(choice)]
1226
 
 
 
 
 
1227
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
1228
  recogniser_entities_drop = gr.Dropdown(value=choice[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
1229
 
 
466
 
467
  return merged_df
468
 
469
def get_and_merge_current_page_annotations(
    page_sizes: List[Dict],
    annotate_current_page: int,
    existing_annotations_list: List[Dict],
    existing_annotations_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Merge the annotations held for the current page into the main
    annotations DataFrame.

    The image path of the current page is looked up in page_sizes, the
    entries of existing_annotations_list that belong to that image are
    converted to a DataFrame, and those rows are appended to
    existing_annotations_df. The combined frame is sorted by page, xmin
    and ymin, and rows that repeat an already-seen non-null id are dropped.

    Args:
        page_sizes: One dict per page, each with an "image_path" key.
        annotate_current_page: 1-based number of the page being annotated.
        existing_annotations_list: Per-image annotation dicts, each with an
            "image" key.
        existing_annotations_df: The main annotations DataFrame to merge into.

    Returns:
        The merged DataFrame. If the page number cannot be resolved to an
        entry of page_sizes, existing_annotations_df is returned unchanged.
    """
    # The page number may arrive as a float (e.g. 1.0), which cannot be used
    # as a list index, so coerce it to int first.
    # NOTE(review): presumably this comes from a UI number input - confirm.
    try:
        page_number = int(annotate_current_page)
    except (TypeError, ValueError):
        return existing_annotations_df

    # Reject out-of-range pages explicitly: page 0 would otherwise become
    # index -1 and silently select the last page of the document.
    if not page_sizes or not 1 <= page_number <= len(page_sizes):
        return existing_annotations_df

    current_page_image = page_sizes[page_number - 1]["image_path"]

    current_page_items = [
        item for item in (existing_annotations_list or [])
        if item.get("image") == current_page_image
    ]

    current_page_annotations_df = convert_annotation_data_to_dataframe(current_page_items)

    combined_df = pd.concat(
        [existing_annotations_df, current_page_annotations_df], ignore_index=True
    )

    # Only sort on columns that are present; an empty combined frame may have
    # none of them and sort_values would raise KeyError.
    sort_columns = [col for col in ("page", "xmin", "ymin") if col in combined_df.columns]
    if sort_columns:
        combined_df = combined_df.sort_values(by=sort_columns)

    if "id" not in combined_df.columns:
        return combined_df

    # drop_duplicates(subset=["id"]) treats missing ids as equal to each other
    # and would collapse every annotation without an id into a single row.
    # Only rows that carry an id are de-duplicated; the first occurrence in
    # sorted order is kept, as before.
    # NOTE(review): which copy wins depends on sort position, not on whether
    # it came from the current page - confirm this is the intended precedence.
    has_id = combined_df["id"].notna()
    is_repeat = combined_df.duplicated(subset=["id"], keep="first")

    return combined_df[~(has_id & is_repeat)]
495
+
496
  def create_annotation_objects_from_filtered_ocr_results_with_words(
497
  filtered_ocr_results_with_words_df: pd.DataFrame,
498
  ocr_results_with_words_df_base: pd.DataFrame,
 
502
  existing_recogniser_entity_df: pd.DataFrame,
503
  redaction_label:str = "Redaction",
504
  colour_label:str = '(0, 0, 0)',
505
+ annotate_current_page:int = 1,
506
  progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
507
  """
508
  This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
 
521
  """
522
 
523
  # Validate colour_label: must be a 3-number tuple with each value in [0, 255]
524
+ # If invalid, fallback to '(0, 0, 0)' as requested
525
+ fallback_colour = '(0, 0, 0)'
526
+
527
+
528
+ existing_annotations_df = get_and_merge_current_page_annotations(
529
+ page_sizes,
530
+ annotate_current_page,
531
+ existing_annotations_list,
532
+ existing_annotations_df
533
+ )
534
+
535
  try:
536
  valid = False
537
  if isinstance(colour_label, str):
 
544
  elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
545
  r_val, g_val, b_val = colour_label
546
  if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
547
+ colour_label = f'({r_val}, {g_val}, {b_val})'
548
  valid = True
549
  if not valid:
550
  colour_label = fallback_colour
 
610
  indicator=True
611
  )
612
  unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
 
613
 
614
  print(f"Found {len(unique_new_df)} new unique annotations to add.")
615
  gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
 
642
  merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
643
 
644
  # 3. Sort the DataFrame based on this new custom order.
645
+ merged_df = merged_df.sort_values('image')
 
646
 
647
  final_annotations_list = list()
648
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
 
651
  # DataFrame's current order, which we have just manually set. This is slightly
652
  # more efficient than letting it sort again.
653
  for image_path, group in merged_df.groupby('image', sort=False, observed=False):
 
 
654
 
655
+ # Check if the group has actual annotations.
 
656
  if pd.isna(group.iloc[0].get('id')):
657
  boxes = list()
658
  else:
 
668
 
669
  progress(1.0, desc="Completed annotation processing")
670
 
671
+ print("final_annotations_list:", final_annotations_list)
672
+
673
  return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
674
 
675
  def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
 
878
  review_df['page'] = pd.to_numeric(review_df['page'], errors='coerce').fillna(-1).astype(int)
879
  review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
880
 
 
881
  except Exception as e:
882
  print(f"Error during image path replacement for page {page_num_reported}: {e}")
883
  else:
 
890
  else:
891
  page_sizes = list() # Ensure page_sizes is a list if df is empty
892
 
893
+ # --- Prepare data *only* for the current page for display ---
894
  current_page_image_annotator_object = None
895
  if len(all_image_annotations) > page_num_reported_zero_indexed:
896
  page_data_for_display = all_image_annotations[page_num_reported_zero_indexed]
 
1017
  Overwrite image annotations on the page we are moving from with modifications.
1018
  '''
1019
 
1020
+ if current_page > len(page_sizes): raise Warning("Selected page is higher than last page number")
1021
+ elif current_page <= 0: raise Warning("Selected page is lower than first page")
 
 
1022
 
1023
+ #print("all_image_annotations:", all_image_annotations)
1024
+ #print("page_image_annotator_object:", page_image_annotator_object)
1025
 
1026
  previous_page_zero_index = previous_page -1
1027
 
 
1033
  if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
1034
  else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
1035
 
1036
+ #print("all_image_annotations:", all_image_annotations)
1037
+
1038
  return all_image_annotations, current_page, current_page
1039
 
1040
  def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
 
1214
 
1215
  filtered_df = df.copy()
1216
 
1217
+ if not label_dropdown_value[0]: label_dropdown_value[0] = "ALL"
1218
+ if not text_dropdown_value[0]: text_dropdown_value[0] = "ALL"
1219
+ if not page_dropdown_value[0]: page_dropdown_value[0] = "1"
 
 
 
 
 
 
1220
 
1221
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
1222
  recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
 
1253
  if not "ALL" in choice:
1254
  filtered_df = filtered_df[filtered_df["label"].astype(str).isin(choice)]
1255
 
1256
+ if not choice[0]: choice[0] = "ALL"
1257
+ if not text_dropdown_value[0]: text_dropdown_value[0] = "ALL"
1258
+ if not page_dropdown_value[0]: page_dropdown_value[0] = "1"
1259
+
1260
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
1261
  recogniser_entities_drop = gr.Dropdown(value=choice[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
1262