Merge pull request #56 from seanpedrick-case/review_updates
Browse files
Updated review functions to update with manual reviews. Minor package update
- app.py +30 -15
- pyproject.toml +1 -1
- requirements.txt +1 -1
- tools/find_duplicate_pages.py +4 -4
- tools/redaction_review.py +58 -25
app.py
CHANGED
@@ -7,7 +7,7 @@ from tools.helper_functions import put_columns_in_df, get_connection_params, rev
|
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
10 |
-
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact
|
11 |
from tools.data_anonymise import anonymise_files_with_open_text
|
12 |
from tools.auth import authenticate_user
|
13 |
from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
|
@@ -756,7 +756,7 @@ with app:
|
|
756 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
757 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df], scroll_to_output=True)
|
758 |
|
759 |
-
# Save current page redactions
|
760 |
update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
761 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
762 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df])
|
@@ -772,6 +772,8 @@ with app:
|
|
772 |
|
773 |
# Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
|
774 |
recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
|
|
|
|
|
775 |
success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
|
776 |
success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
|
777 |
success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
|
@@ -779,22 +781,28 @@ with app:
|
|
779 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
780 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
781 |
|
782 |
-
|
783 |
# Exclude only selected row
|
784 |
-
exclude_selected_row_btn.click(
|
|
|
|
|
785 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
786 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
787 |
success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
|
788 |
|
789 |
# Exclude all items with same text as selected row
|
790 |
-
exclude_text_with_same_as_selected_row_btn.click(
|
791 |
-
|
|
|
|
|
792 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
793 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
794 |
success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
|
795 |
|
796 |
# Exclude everything visible in table
|
797 |
-
exclude_selected_btn.click(
|
|
|
|
|
798 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
799 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
800 |
success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
|
@@ -837,6 +845,8 @@ with app:
|
|
837 |
|
838 |
# Clicking on a cell in the redact items table will take you to that page
|
839 |
all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
|
|
|
|
|
840 |
success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
|
841 |
success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
|
842 |
|
@@ -844,27 +854,32 @@ with app:
|
|
844 |
reset_dropdowns_btn_new.click(reset_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction]).\
|
845 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
846 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
847 |
# Reset redaction table following filtering
|
848 |
reset_ocr_with_words_df_btn.click(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
|
849 |
|
850 |
# Redact current selection
|
851 |
-
redact_selected_row_btn.click(
|
|
|
852 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
853 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
854 |
success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
|
855 |
|
856 |
# Redact all items with same text as selected row
|
857 |
-
redact_text_with_same_as_selected_row_btn.click(
|
858 |
-
success(
|
|
|
859 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
860 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
861 |
success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
|
862 |
|
863 |
-
|
864 |
-
redact_selected_btn.click(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[all_page_line_level_ocr_results_with_words_df, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
|
865 |
-
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
866 |
-
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
867 |
-
success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
|
868 |
|
869 |
# Undo last redaction action
|
870 |
undo_last_redact_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
|
|
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
10 |
+
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top, store_duplicate_selection, create_annotation_objects_from_filtered_ocr_results_with_words, df_select_callback_dataframe_row_ocr_with_words, update_redact_choice_df_from_page_dropdown, get_all_rows_with_same_text_redact,get_and_merge_current_page_annotations
|
11 |
from tools.data_anonymise import anonymise_files_with_open_text
|
12 |
from tools.auth import authenticate_user
|
13 |
from tools.load_spacy_model_custom_recognisers import custom_entities, load_spacy_model, download_tesseract_lang_pack
|
|
|
756 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
757 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df], scroll_to_output=True)
|
758 |
|
759 |
+
# Save current page manual redactions
|
760 |
update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
761 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
762 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df])
|
|
|
772 |
|
773 |
# Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
|
774 |
recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
|
775 |
+
success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
776 |
+
success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
|
777 |
success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_df, selected_entity_id, selected_entity_colour], outputs=[review_file_df, selected_entity_id, selected_entity_colour]).\
|
778 |
success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
|
779 |
success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
|
|
|
781 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
782 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
783 |
|
784 |
+
### Exclude current selection from annotator and outputs
|
785 |
# Exclude only selected row
|
786 |
+
exclude_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
787 |
+
success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
|
788 |
+
success(exclude_selected_items_from_redaction, inputs=[review_file_df, selected_entity_dataframe_row, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
|
789 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
790 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
791 |
success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
|
792 |
|
793 |
# Exclude all items with same text as selected row
|
794 |
+
exclude_text_with_same_as_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
795 |
+
success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
|
796 |
+
success(get_all_rows_with_same_text, inputs=[recogniser_entity_dataframe_base, selected_entity_dataframe_row_text], outputs=[recogniser_entity_dataframe_same_text]).\
|
797 |
+
success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe_same_text, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
|
798 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
799 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
800 |
success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
|
801 |
|
802 |
# Exclude everything visible in table
|
803 |
+
exclude_selected_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
804 |
+
success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
|
805 |
+
success(exclude_selected_items_from_redaction, inputs=[review_file_df, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
|
806 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
807 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
808 |
success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
|
|
|
845 |
|
846 |
# Clicking on a cell in the redact items table will take you to that page
|
847 |
all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
|
848 |
+
success(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
849 |
+
success(get_and_merge_current_page_annotations, inputs=[page_sizes, annotate_current_page, all_image_annotations_state, review_file_df], outputs=[review_file_df]).\
|
850 |
success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
|
851 |
success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
|
852 |
|
|
|
854 |
reset_dropdowns_btn_new.click(reset_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction]).\
|
855 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
|
856 |
|
857 |
+
# Redact everything visible in table
|
858 |
+
redact_selected_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
859 |
+
success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[all_page_line_level_ocr_results_with_words_df, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
|
860 |
+
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
861 |
+
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
862 |
+
success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
|
863 |
+
|
864 |
# Reset redaction table following filtering
|
865 |
reset_ocr_with_words_df_btn.click(reset_ocr_with_words_base_dataframe, inputs=[all_page_line_level_ocr_results_with_words_df_base, page_entity_dropdown_redaction], outputs=[all_page_line_level_ocr_results_with_words_df, backup_all_page_line_level_ocr_results_with_words_df_base])
|
866 |
|
867 |
# Redact current selection
|
868 |
+
redact_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
869 |
+
success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[selected_entity_dataframe_row_redact, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state, recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
|
870 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
871 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
872 |
success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
|
873 |
|
874 |
# Redact all items with same text as selected row
|
875 |
+
redact_text_with_same_as_selected_row_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
876 |
+
success(get_all_rows_with_same_text_redact, inputs=[all_page_line_level_ocr_results_with_words_df_base, selected_entity_dataframe_row_text_redact], outputs=[to_redact_dataframe_same_text]).\
|
877 |
+
success(create_annotation_objects_from_filtered_ocr_results_with_words, inputs=[to_redact_dataframe_same_text, all_page_line_level_ocr_results_with_words_df_base, page_sizes, review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base, new_redaction_text_label, colour_label, annotate_current_page], outputs=[all_image_annotations_state, backup_image_annotations_state, review_file_df, backup_review_state,recogniser_entity_dataframe, backup_recogniser_entity_dataframe_base]).\
|
878 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state]).\
|
879 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, input_pdf_for_review, log_files_output, review_file_df]).\
|
880 |
success(update_all_entity_df_dropdowns, inputs=[all_page_line_level_ocr_results_with_words_df_base, recogniser_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown_redaction])
|
881 |
|
882 |
+
|
|
|
|
|
|
|
|
|
883 |
|
884 |
# Undo last redaction action
|
885 |
undo_last_redact_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
|
pyproject.toml
CHANGED
@@ -23,7 +23,7 @@ dependencies = [
|
|
23 |
"spacy==3.8.7",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
-
"gradio==5.
|
27 |
"boto3==1.40.10",
|
28 |
"pyarrow==21.0.0",
|
29 |
"openpyxl==3.1.5",
|
|
|
23 |
"spacy==3.8.7",
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
+
"gradio==5.44.0",
|
27 |
"boto3==1.40.10",
|
28 |
"pyarrow==21.0.0",
|
29 |
"openpyxl==3.1.5",
|
requirements.txt
CHANGED
@@ -10,7 +10,7 @@ pandas==2.3.1
|
|
10 |
scikit-learn==1.7.1
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
-
gradio==5.
|
14 |
boto3==1.40.10
|
15 |
pyarrow==21.0.0
|
16 |
openpyxl==3.1.5
|
|
|
10 |
scikit-learn==1.7.1
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
+
gradio==5.44.0
|
14 |
boto3==1.40.10
|
15 |
pyarrow==21.0.0
|
16 |
openpyxl==3.1.5
|
tools/find_duplicate_pages.py
CHANGED
@@ -570,7 +570,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
|
|
570 |
final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
|
571 |
|
572 |
output_paths.append(str(similarity_file_output_path))
|
573 |
-
print(f"Main results saved to {similarity_file_output_path}")
|
574 |
|
575 |
# 2. Save per-file redaction lists
|
576 |
# Use 'Page2_File' as the source of duplicate content
|
@@ -663,7 +663,7 @@ def find_consecutive_sequence_matches(
|
|
663 |
A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
|
664 |
consecutive match, or an empty DataFrame if no match is found.
|
665 |
"""
|
666 |
-
print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
|
667 |
|
668 |
# Step 1: Isolate the data for each file
|
669 |
search_df = df_filtered[df_filtered['file'] == search_file_name]
|
@@ -693,7 +693,7 @@ def find_consecutive_sequence_matches(
|
|
693 |
|
694 |
# Step 4: If the window matches the query with or without punctuation on end
|
695 |
if _sequences_match(query_tokens, window):
|
696 |
-
print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
|
697 |
|
698 |
# Get the global indices for this entire matching block
|
699 |
matching_reference_indices = reference_indices[i : i + query_len]
|
@@ -795,7 +795,7 @@ def identify_similar_text_sequences(
|
|
795 |
progress(0.7, desc="Aggregating results based on matching strategy")
|
796 |
|
797 |
if greedy_match or min_consecutive_pages > 1:
|
798 |
-
print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
|
799 |
|
800 |
# Sort the dataframe to ensure consecutive pages are adjacent
|
801 |
similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()
|
|
|
570 |
final_df.to_csv(similarity_file_output_path, index=False, encoding="utf-8-sig")
|
571 |
|
572 |
output_paths.append(str(similarity_file_output_path))
|
573 |
+
#print(f"Main results saved to {similarity_file_output_path}")
|
574 |
|
575 |
# 2. Save per-file redaction lists
|
576 |
# Use 'Page2_File' as the source of duplicate content
|
|
|
663 |
A DataFrame with two columns ('Page1_Index', 'Page2_Index') mapping the
|
664 |
consecutive match, or an empty DataFrame if no match is found.
|
665 |
"""
|
666 |
+
#print(f"Starting sequence search for '{search_file_name}' in '{reference_file_name}'...")
|
667 |
|
668 |
# Step 1: Isolate the data for each file
|
669 |
search_df = df_filtered[df_filtered['file'] == search_file_name]
|
|
|
693 |
|
694 |
# Step 4: If the window matches the query with or without punctuation on end
|
695 |
if _sequences_match(query_tokens, window):
|
696 |
+
#print(f"Found a consecutive match starting at reference index: {reference_indices[i]}")
|
697 |
|
698 |
# Get the global indices for this entire matching block
|
699 |
matching_reference_indices = reference_indices[i : i + query_len]
|
|
|
795 |
progress(0.7, desc="Aggregating results based on matching strategy")
|
796 |
|
797 |
if greedy_match or min_consecutive_pages > 1:
|
798 |
+
#print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
|
799 |
|
800 |
# Sort the dataframe to ensure consecutive pages are adjacent
|
801 |
similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()
|
tools/redaction_review.py
CHANGED
@@ -466,6 +466,33 @@ def _merge_horizontally_adjacent_boxes(
|
|
466 |
|
467 |
return merged_df
|
468 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
469 |
def create_annotation_objects_from_filtered_ocr_results_with_words(
|
470 |
filtered_ocr_results_with_words_df: pd.DataFrame,
|
471 |
ocr_results_with_words_df_base: pd.DataFrame,
|
@@ -475,6 +502,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
475 |
existing_recogniser_entity_df: pd.DataFrame,
|
476 |
redaction_label:str = "Redaction",
|
477 |
colour_label:str = '(0, 0, 0)',
|
|
|
478 |
progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
479 |
"""
|
480 |
This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
|
@@ -493,8 +521,17 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
493 |
"""
|
494 |
|
495 |
# Validate colour_label: must be a 3-number tuple with each value in [0, 255]
|
496 |
-
# If invalid, fallback to '(0, 0, 0
|
497 |
-
fallback_colour = '(0, 0, 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
498 |
try:
|
499 |
valid = False
|
500 |
if isinstance(colour_label, str):
|
@@ -507,7 +544,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
507 |
elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
|
508 |
r_val, g_val, b_val = colour_label
|
509 |
if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
|
510 |
-
colour_label = f'({r_val}, {g_val}, {b_val}
|
511 |
valid = True
|
512 |
if not valid:
|
513 |
colour_label = fallback_colour
|
@@ -573,7 +610,6 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
573 |
indicator=True
|
574 |
)
|
575 |
unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
|
576 |
-
#unique_new_df = new_annotations_df
|
577 |
|
578 |
print(f"Found {len(unique_new_df)} new unique annotations to add.")
|
579 |
gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
|
@@ -606,8 +642,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
606 |
merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
|
607 |
|
608 |
# 3. Sort the DataFrame based on this new custom order.
|
609 |
-
merged_df = merged_df.sort_values('image')
|
610 |
-
|
611 |
|
612 |
final_annotations_list = list()
|
613 |
box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
|
@@ -616,11 +651,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
616 |
# DataFrame's current order, which we have just manually set. This is slightly
|
617 |
# more efficient than letting it sort again.
|
618 |
for image_path, group in merged_df.groupby('image', sort=False, observed=False):
|
619 |
-
# The progress.tqdm wrapper can be added back around the groupby object as you had it.
|
620 |
-
# for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):
|
621 |
|
622 |
-
# Check if the group has actual annotations.
|
623 |
-
# without annotations will have one row with NaN values from the merge.
|
624 |
if pd.isna(group.iloc[0].get('id')):
|
625 |
boxes = list()
|
626 |
else:
|
@@ -636,6 +668,8 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
|
|
636 |
|
637 |
progress(1.0, desc="Completed annotation processing")
|
638 |
|
|
|
|
|
639 |
return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
|
640 |
|
641 |
def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
@@ -844,7 +878,6 @@ def update_annotator_object_and_filter_df(
|
|
844 |
review_df['page'] = pd.to_numeric(review_df['page'], errors='coerce').fillna(-1).astype(int)
|
845 |
review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
|
846 |
|
847 |
-
|
848 |
except Exception as e:
|
849 |
print(f"Error during image path replacement for page {page_num_reported}: {e}")
|
850 |
else:
|
@@ -857,7 +890,7 @@ def update_annotator_object_and_filter_df(
|
|
857 |
else:
|
858 |
page_sizes = list() # Ensure page_sizes is a list if df is empty
|
859 |
|
860 |
-
# ---
|
861 |
current_page_image_annotator_object = None
|
862 |
if len(all_image_annotations) > page_num_reported_zero_indexed:
|
863 |
page_data_for_display = all_image_annotations[page_num_reported_zero_indexed]
|
@@ -984,11 +1017,11 @@ def update_all_page_annotation_object_based_on_previous_page(
|
|
984 |
Overwrite image annotations on the page we are moving from with modifications.
|
985 |
'''
|
986 |
|
987 |
-
if current_page > len(page_sizes):
|
988 |
-
|
989 |
-
elif current_page <= 0:
|
990 |
-
raise Warning("Selected page is lower than first page")
|
991 |
|
|
|
|
|
992 |
|
993 |
previous_page_zero_index = previous_page -1
|
994 |
|
@@ -1000,6 +1033,8 @@ def update_all_page_annotation_object_based_on_previous_page(
|
|
1000 |
if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
|
1001 |
else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
|
1002 |
|
|
|
|
|
1003 |
return all_image_annotations, current_page, current_page
|
1004 |
|
1005 |
def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
|
@@ -1179,15 +1214,9 @@ def update_all_entity_df_dropdowns(df:pd.DataFrame, label_dropdown_value:str, pa
|
|
1179 |
|
1180 |
filtered_df = df.copy()
|
1181 |
|
1182 |
-
|
1183 |
-
|
1184 |
-
|
1185 |
-
|
1186 |
-
# if not "ALL" in text_dropdown_value:
|
1187 |
-
# filtered_df = filtered_df[filtered_df["text"].astype(str).isin(text_dropdown_value)]
|
1188 |
-
|
1189 |
-
# if not "ALL" in label_dropdown_value:
|
1190 |
-
# filtered_df = filtered_df[filtered_df["label"].astype(str).isin(label_dropdown_value)]
|
1191 |
|
1192 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
|
1193 |
recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
@@ -1224,6 +1253,10 @@ def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dro
|
|
1224 |
if not "ALL" in choice:
|
1225 |
filtered_df = filtered_df[filtered_df["label"].astype(str).isin(choice)]
|
1226 |
|
|
|
|
|
|
|
|
|
1227 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
|
1228 |
recogniser_entities_drop = gr.Dropdown(value=choice[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
1229 |
|
|
|
466 |
|
467 |
return merged_df
|
468 |
|
469 |
+
def get_and_merge_current_page_annotations(
    page_sizes: List[Dict],
    annotate_current_page: int,
    existing_annotations_list: List[Dict],
    existing_annotations_df: pd.DataFrame
) -> pd.DataFrame:
    """
    Extract the annotations for the current page and merge them into the
    main existing_annotations_df.

    The image path of the current page is looked up in page_sizes, the entries
    of existing_annotations_list that refer to that image are converted to a
    DataFrame with convert_annotation_data_to_dataframe, and the result is
    appended to existing_annotations_df. The combined rows are sorted by
    'page', 'xmin' and 'ymin' and then de-duplicated on 'id', keeping the first
    row for each id in that sorted order.

    Args:
        page_sizes (List[Dict]): One dict per page, in page order; each dict is
            read for its "image_path" key.
        annotate_current_page (int): 1-based number of the page being reviewed.
        existing_annotations_list (List[Dict]): Annotation dicts; each is read
            for its "image" key to decide whether it belongs to the current page.
        existing_annotations_df (pd.DataFrame): The review DataFrame to merge into.

    Returns:
        pd.DataFrame: The merged, sorted and de-duplicated DataFrame. If
        annotate_current_page does not correspond to an entry in page_sizes,
        existing_annotations_df is returned unchanged.
    """
    # Guard against page numbers outside the document. Without this check,
    # page 0 (or a negative number) would silently wrap round to the end of
    # page_sizes through negative indexing and merge a different page's
    # annotations, while a page past the end would raise a bare IndexError.
    if not 1 <= annotate_current_page <= len(page_sizes):
        print(f"Page {annotate_current_page} is outside the document page range, no annotations merged.")
        return existing_annotations_df

    current_page_image = page_sizes[annotate_current_page - 1]["image_path"]

    existing_annotations_current_page = [
        item for item in existing_annotations_list if item["image"] == current_page_image
    ]

    current_page_annotations_df = convert_annotation_data_to_dataframe(existing_annotations_current_page)

    # Existing rows are listed first in the concat; the sort then fixes the
    # order in which drop_duplicates picks the surviving row for each id.
    combined_df = pd.concat(
        [existing_annotations_df, current_page_annotations_df], ignore_index=True
    )
    sorted_df = combined_df.sort_values(by=["page", "xmin", "ymin"])
    updated_df = sorted_df.drop_duplicates(subset=["id"], keep="first")

    return updated_df
|
495 |
+
|
496 |
def create_annotation_objects_from_filtered_ocr_results_with_words(
|
497 |
filtered_ocr_results_with_words_df: pd.DataFrame,
|
498 |
ocr_results_with_words_df_base: pd.DataFrame,
|
|
|
502 |
existing_recogniser_entity_df: pd.DataFrame,
|
503 |
redaction_label:str = "Redaction",
|
504 |
colour_label:str = '(0, 0, 0)',
|
505 |
+
annotate_current_page:int = 1,
|
506 |
progress:gr.Progress=gr.Progress()) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
|
507 |
"""
|
508 |
This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
|
|
|
521 |
"""
|
522 |
|
523 |
# Validate colour_label: must be a 3-number tuple with each value in [0, 255]
|
524 |
+
# If invalid, fall back to '(0, 0, 0)'
|
525 |
+
fallback_colour = '(0, 0, 0)'
|
526 |
+
|
527 |
+
|
528 |
+
existing_annotations_df = get_and_merge_current_page_annotations(
|
529 |
+
page_sizes,
|
530 |
+
annotate_current_page,
|
531 |
+
existing_annotations_list,
|
532 |
+
existing_annotations_df
|
533 |
+
)
|
534 |
+
|
535 |
try:
|
536 |
valid = False
|
537 |
if isinstance(colour_label, str):
|
|
|
544 |
elif isinstance(colour_label, (tuple, list)) and len(colour_label) == 3:
|
545 |
r_val, g_val, b_val = colour_label
|
546 |
if all(isinstance(v, int) for v in (r_val, g_val, b_val)) and all(0 <= v <= 255 for v in (r_val, g_val, b_val)):
|
547 |
+
colour_label = f'({r_val}, {g_val}, {b_val})'
|
548 |
valid = True
|
549 |
if not valid:
|
550 |
colour_label = fallback_colour
|
|
|
610 |
indicator=True
|
611 |
)
|
612 |
unique_new_df = merged[merged['_merge'] == 'left_only'].drop(columns=['_merge'])
|
|
|
613 |
|
614 |
print(f"Found {len(unique_new_df)} new unique annotations to add.")
|
615 |
gr.Info(f"Found {len(unique_new_df)} new unique annotations to add.")
|
|
|
642 |
merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
|
643 |
|
644 |
# 3. Sort the DataFrame based on this new custom order.
|
645 |
+
merged_df = merged_df.sort_values('image')
|
|
|
646 |
|
647 |
final_annotations_list = list()
|
648 |
box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
|
|
|
651 |
# DataFrame's current order, which we have just manually set. This is slightly
|
652 |
# more efficient than letting it sort again.
|
653 |
for image_path, group in merged_df.groupby('image', sort=False, observed=False):
|
|
|
|
|
654 |
|
655 |
+
# Check if the group has actual annotations.
|
|
|
656 |
if pd.isna(group.iloc[0].get('id')):
|
657 |
boxes = list()
|
658 |
else:
|
|
|
668 |
|
669 |
progress(1.0, desc="Completed annotation processing")
|
670 |
|
671 |
+
print("final_annotations_list:", final_annotations_list)
|
672 |
+
|
673 |
return final_annotations_list, existing_annotations_list, updated_annotations_df, existing_annotations_df, updated_recogniser_entity_df, existing_recogniser_entity_df
|
674 |
|
675 |
def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
|
|
878 |
review_df['page'] = pd.to_numeric(review_df['page'], errors='coerce').fillna(-1).astype(int)
|
879 |
review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
|
880 |
|
|
|
881 |
except Exception as e:
|
882 |
print(f"Error during image path replacement for page {page_num_reported}: {e}")
|
883 |
else:
|
|
|
890 |
else:
|
891 |
page_sizes = list() # Ensure page_sizes is a list if df is empty
|
892 |
|
893 |
+
# --- Prepare data *only* for the current page for display ---
|
894 |
current_page_image_annotator_object = None
|
895 |
if len(all_image_annotations) > page_num_reported_zero_indexed:
|
896 |
page_data_for_display = all_image_annotations[page_num_reported_zero_indexed]
|
|
|
1017 |
Overwrite image annotations on the page we are moving from with modifications.
|
1018 |
'''
|
1019 |
|
1020 |
+
if current_page > len(page_sizes): raise Warning("Selected page is higher than last page number")
|
1021 |
+
elif current_page <= 0: raise Warning("Selected page is lower than first page")
|
|
|
|
|
1022 |
|
1023 |
+
#print("all_image_annotations:", all_image_annotations)
|
1024 |
+
#print("page_image_annotator_object:", page_image_annotator_object)
|
1025 |
|
1026 |
previous_page_zero_index = previous_page -1
|
1027 |
|
|
|
1033 |
if clear_all == False: all_image_annotations[previous_page_zero_index] = page_image_annotator_object
|
1034 |
else: all_image_annotations[previous_page_zero_index]["boxes"] = list()
|
1035 |
|
1036 |
+
#print("all_image_annotations:", all_image_annotations)
|
1037 |
+
|
1038 |
return all_image_annotations, current_page, current_page
|
1039 |
|
1040 |
def apply_redactions_to_review_df_and_files(page_image_annotator_object:AnnotatedImageData,
|
|
|
1214 |
|
1215 |
filtered_df = df.copy()
|
1216 |
|
1217 |
+
if not label_dropdown_value[0]: label_dropdown_value[0] = "ALL"
|
1218 |
+
if not text_dropdown_value[0]: text_dropdown_value[0] = "ALL"
|
1219 |
+
if not page_dropdown_value[0]: page_dropdown_value[0] = "1"
|
|
|
|
|
|
|
|
|
|
|
|
|
1220 |
|
1221 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
|
1222 |
recogniser_entities_drop = gr.Dropdown(value=label_dropdown_value[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
|
|
1253 |
if not "ALL" in choice:
|
1254 |
filtered_df = filtered_df[filtered_df["label"].astype(str).isin(choice)]
|
1255 |
|
1256 |
+
if not choice[0]: choice[0] = "ALL"
|
1257 |
+
if not text_dropdown_value[0]: text_dropdown_value[0] = "ALL"
|
1258 |
+
if not page_dropdown_value[0]: page_dropdown_value[0] = "1"
|
1259 |
+
|
1260 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(filtered_df, "label")
|
1261 |
recogniser_entities_drop = gr.Dropdown(value=choice[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
1262 |
|