Commit 87e1451 · Parent: ee6b7fb
Added a multi word search tool on the redaction review tool page

Files changed:
- app.py (+43 −6)
- tools/file_redaction.py (+13 −3)
- tools/find_duplicate_pages.py (+689 −180)
- tools/helper_functions.py (+5 −2)
- tools/redaction_review.py (+118 −31)
app.py
CHANGED
@@ -12,7 +12,7 @@ from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
-from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates
+from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
 from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs

 # Suppress downcasting warnings

@@ -409,8 +409,15 @@ with app:
 with gr.Tab("Search and redact"):
 with gr.Accordion("Search text", open=True):
 with gr.Row(equal_height=True):
-page_entity_dropdown_redaction = gr.Dropdown(label="Page", value="1", allow_custom_value=True)
-reset_dropdowns_btn_new = gr.Button(value="Reset page filter")
+page_entity_dropdown_redaction = gr.Dropdown(label="Page", value="1", allow_custom_value=True, scale=4)
+reset_dropdowns_btn_new = gr.Button(value="Reset page filter", scale=1)
+
+with gr.Row():
+multi_word_search_text = gr.Textbox(label="Multi-word text search", value="", scale=4)
+multi_word_search_text_btn = gr.Button(value="Search", scale=1)
+
+with gr.Accordion("Search options", open=False):
+similarity_search_score_minimum = gr.Number(value=1.0, minimum=0.4, maximum=1.0, label="Minimum similarity score for match (max=1)")

 all_page_line_level_ocr_results_with_words_df = gr.Dataframe(pd.DataFrame(data={"page":[], "line":[], "word_text":[], "word_x0":[], "word_y0":[],"word_x1":[],"word_y1":[]}), type="pandas", label="Click table row to select and go to page", headers=["page", "line", "word_text", "word_x0","word_y0","word_x1","word_y1"], show_fullscreen_button=True, wrap=False, max_height=400, show_search="filter")

@@ -472,6 +479,7 @@ with app:
 with gr.Row():
 results_df_preview = gr.Dataframe(
 label="Similarity Results",
+headers=["Page1_File", "Page1_Start_Page", "Page1_End_Page", "Page2_File", "Page2_Start_Page", "Page2_End_Page", "Match_Length", "Avg_Similarity", "Page1_Text", "Page2_Text"],
 wrap=True,
 show_fullscreen_button=True,
 show_search=True,

@@ -636,7 +644,7 @@ with app:
 success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox])

 # Run redaction function
-document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
+document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
 success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
 success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
 outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")

@@ -690,7 +698,7 @@ with app:
 success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_page_line_level_ocr_results_df_base, all_page_line_level_ocr_results_with_words_df_base, latest_file_completed_num, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_page_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox, all_page_line_level_ocr_results_with_words_df_base], api_name="prepare_doc").\
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])

-# Manual updates to review
+# Manual updates to review df
 review_file_df.input(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])

@@ -782,6 +790,32 @@ with app:
 ###
 page_entity_dropdown_redaction.select(update_redact_choice_df_from_page_dropdown, inputs=[page_entity_dropdown_redaction, all_page_line_level_ocr_results_with_words_df_base], outputs=[all_page_line_level_ocr_results_with_words_df])

+multi_word_search_text.submit(
+    fn=run_full_search_and_analysis,
+    inputs=[
+        multi_word_search_text,
+        all_page_line_level_ocr_results_with_words_df_base,
+        similarity_search_score_minimum
+    ],
+    outputs=[
+        all_page_line_level_ocr_results_with_words_df,
+        duplicate_files_out,
+        full_duplicate_data_by_file
+    ])
+
+multi_word_search_text_btn.click(
+    fn=run_full_search_and_analysis,
+    inputs=[
+        multi_word_search_text,
+        all_page_line_level_ocr_results_with_words_df_base,
+        similarity_search_score_minimum
+    ],
+    outputs=[
+        all_page_line_level_ocr_results_with_words_df,
+        duplicate_files_out,
+        full_duplicate_data_by_file
+    ])
+
 # Clicking on a cell in the redact items table will take you to that page
 all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
 success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\

@@ -871,7 +905,7 @@ with app:
 ],
 outputs=[
 results_df_preview,
-duplicate_files_out,
+duplicate_files_out,
 full_duplicate_data_by_file
 ]
 )

@@ -896,6 +930,8 @@ with app:
 outputs=[review_file_df, all_image_annotations_state]).\
 success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
+
+

 ###
 # SETTINGS PAGE INPUT / OUTPUT

@@ -910,6 +946,7 @@ with app:
 in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
 in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])

+# Apply whole page redactions from the provided whole page redaction csv file upload/list of specific page numbers given by user
 apply_fully_redacted_list_btn.click(
 fn=apply_whole_page_redactions_from_list,
 inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
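For orientation, below is a minimal, self-contained sketch of the event pattern these hunks wire up: pressing Enter in the search textbox and clicking the Search button both call the same function, which returns a filtered table. The callback and sample data are stand-ins for illustration only, not the app's run_full_search_and_analysis.

import gradio as gr
import pandas as pd

def fake_search(query: str, ocr_words: pd.DataFrame, min_score: float):
    # Stand-in callback: naive filter on the first query word.
    # min_score is accepted to mirror the real signature but is unused here.
    if not query.strip():
        return ocr_words
    first_word = query.split()[0]
    mask = ocr_words["word_text"].str.contains(first_word, case=False, na=False)
    return ocr_words[mask]

ocr_df = pd.DataFrame({"page": [1, 1, 2], "word_text": ["Invoice", "total", "Invoice"]})

with gr.Blocks() as demo:
    query = gr.Textbox(label="Multi-word text search")
    search_btn = gr.Button("Search")
    min_score = gr.Number(value=1.0, minimum=0.4, maximum=1.0, label="Minimum similarity score")
    source_df = gr.Dataframe(ocr_df, visible=False)   # hidden "base" table
    results = gr.Dataframe(label="Matches")

    # Both triggers share one callback, as in the commit.
    query.submit(fn=fake_search, inputs=[query, source_df, min_score], outputs=[results])
    search_btn.click(fn=fake_search, inputs=[query, source_df, min_score], outputs=[results])

if __name__ == "__main__":
    demo.launch()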
tools/file_redaction.py
CHANGED
@@ -104,7 +104,7 @@ def choose_and_run_redactor(file_paths:List[str],
 page_min:int=0,
 page_max:int=999,
 estimated_time_taken_state:float=0.0,
-handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
+handwrite_signature_checkbox:List[str]=list(["Extract handwriting", "Extract signatures"]),
 all_request_metadata_str:str = "",
 annotations_all_pages:List[dict]=list(),
 all_page_line_level_ocr_results_df:pd.DataFrame=None,

@@ -132,7 +132,7 @@ def choose_and_run_redactor(file_paths:List[str],
 ocr_file_path:str="",
 all_page_line_level_ocr_results:list[dict] = list(),
 all_page_line_level_ocr_results_with_words:list[dict] = list(),
-all_page_line_level_ocr_results_with_words_df:pd.DataFrame=
+all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
 prepare_images:bool=True,
 RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
 progress=gr.Progress(track_tqdm=True)):

@@ -202,6 +202,11 @@ def choose_and_run_redactor(file_paths:List[str],
 all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 review_out_file_paths = [prepared_pdf_file_paths[0]]

+print("all_page_line_level_ocr_results_with_words at start of choose and run...:", all_page_line_level_ocr_results_with_words)
+
+if all_page_line_level_ocr_results_with_words_df is None:
+    all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
+
 # Create copies of out_file_path objects to avoid overwriting each other on append actions
 out_file_paths = out_file_paths.copy()
 log_files_output_paths = log_files_output_paths.copy()

@@ -663,6 +668,9 @@ def choose_and_run_redactor(file_paths:List[str],
 if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
     log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)

+if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
+    out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
+
 # Convert the gradio annotation boxes to relative coordinates
 progress(0.93, "Creating review file output")
 page_sizes = page_sizes_df.to_dict(orient="records")

@@ -1203,7 +1211,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
 # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
 ###

-def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_results=[], page_handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Extract handwriting", "Extract signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
+def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogniser_results=[], page_handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Extract handwriting", "Extract signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):

 all_bboxes = []
 merged_bboxes = []

@@ -1385,6 +1393,8 @@ def redact_image_pdf(file_path:str,

 tic = time.perf_counter()

+print("all_page_line_level_ocr_results_with_words in redact_image_pdf:", all_page_line_level_ocr_results_with_words)
+
 file_name = get_file_name_without_type(file_path)
 comprehend_query_number_new = 0

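A side note on the changed default here (the DataFrame parameter now defaults to None, with an `if ... is None` guard added near the top of choose_and_run_redactor): a `pd.DataFrame()` default argument is evaluated once at import time and shared across calls, so state can leak between runs. A small illustrative sketch of the difference (names are hypothetical, not from the codebase):

import pandas as pd

def bad(df: pd.DataFrame = pd.DataFrame(columns=["value"])):
    # The same frame object is reused on every call, so rows accumulate.
    df.loc[len(df)] = ["leaked"]
    return len(df)

def good(df: pd.DataFrame = None):
    # A None sentinel gives each call its own fresh frame.
    if df is None:
        df = pd.DataFrame(columns=["value"])
    df.loc[len(df)] = ["fresh"]
    return len(df)

print(bad(), bad())    # 1 2  - state leaks between calls
print(good(), good())  # 1 1  - each call starts empty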
tools/find_duplicate_pages.py
CHANGED
@@ -1,10 +1,12 @@
 import pandas as pd
 import os
 import re
 from tools.helper_functions import OUTPUT_FOLDER
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from typing import List, Tuple, Optional, Dict
 from collections import defaultdict
 import gradio as gr
 from gradio import Progress
@@ -16,76 +18,400 @@ import en_core_web_lg
 nlp = en_core_web_lg.load()

 similarity_threshold = 0.95

[Removed here: the old combine_ocr_output_text implementation. Most of its lines are not legible in this capture; the legible fragments are shown below.]
-    else:
-        file_path = file.name
 if 'page' not in df.columns or 'text' not in df.columns:
-    print(f"Warning: Skipping {
     continue

 df['text'] = df['text'].fillna('').astype(str)
 else:
-grouped = df #.drop('line_number_by_page', axis=1)
-# Add filename column
-grouped['file'] = os.path.basename(file_path)
-all_data.append(grouped)
 if not all_data:
-    raise ValueError("No valid
 combined_df = pd.concat(all_data, ignore_index=True)

 '''
 Clean and stem text columns in a data frame
 '''
@@ -213,142 +539,312 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,

 return output_paths

[Removed here: the old identify_similar_pages implementation. Several of its lines, including the function signature and default values, are only partly legible in this capture; the legible lines are shown below.]
 df_combined: pd.DataFrame,
-similarity_threshold: float =
-min_word_count: int =
 min_consecutive_pages: int = 1,
-greedy_match: bool =
-combine_pages:bool=
 progress=Progress(track_tqdm=True)
 ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
 """
-Identifies similar pages
-1. Single Page: If greedy_match=False and min_consecutive_pages=1.
-2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
-3. Greedy Consecutive Match: If greedy_match=True.
 """
-output_paths = []
 progress(0.1, desc="Processing and filtering text")
 df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
 original_row_count = len(df)
 df_filtered = df[df['word_count'] >= min_word_count].copy()
 df_filtered.reset_index(drop=True, inplace=True)
 print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
 if len(df_filtered) < 2:
 return pd.DataFrame(), [], df_combined
-vectorizer = TfidfVectorizer()
-tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
-progress(0.3, desc="Calculating text similarity")
-similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
-coo_matrix = similarity_matrix.tocoo()
-# Create a DataFrame of all individual page pairs above the threshold.
-# This is the base for all three matching strategies.
-similar_pages = [
-    (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
-    if r < c and v >= similarity_threshold
-]
-if not similar_pages:
-    return pd.DataFrame(), [], df_combined
-base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])

-# Keep track of indices that have been used in a sequence
-consumed_indices_1 = set()
-consumed_indices_2 = set()
-all_sequences = []
-# Iterate through all potential starting pairs, sorted for consistent results
-sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
-for _, row in sorted_pairs.iterrows():
-    start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
-    while True:
-        next_idx1 = start_idx1 + k
-        next_idx2 = start_idx2 + k
-        # Check if the next pair in the sequence is a valid match
-        if (next_idx1, next_idx2) in valid_pairs_set and \
-           next_idx1 not in consumed_indices_1 and \
-           next_idx2 not in consumed_indices_2:
-            current_sequence.append((next_idx1, next_idx2))
-            k += 1
-        else:
-            # The sequence has ended
-            break
-    # Record the found sequence and mark all its pages as consumed
-    sequence_indices_1 = [p[0] for p in current_sequence]
-    sequence_indices_2 = [p[1] for p in current_sequence]
-    all_sequences.append({
-        'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
-        'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
-        'Match_Length': len(current_sequence)
-    })
 return pd.DataFrame(), [], df_combined

-print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
-similarity_df = base_similarity_df.copy()
-similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
 is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
 block_id = is_consecutive.eq(False).cumsum()
 grouped = similarity_df.groupby(block_id)
 agg_results = grouped.agg(
-    Page1_Start_Index=('Page1_Index', 'first'),
 ).reset_index(drop=True)
-subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
-if subdocument_df.empty: return pd.DataFrame(), [], df_combined
 else:
 final_df = map_metadata_single_page(base_similarity_df, df_filtered)

-# --- Map metadata and format output ---
-# This block now handles the output for both subdocument strategies (2 and 3)
-if greedy_match or min_consecutive_pages > 1:
-    final_df = map_metadata_subdocument(subdocument_df, df_filtered)
 progress(0.8, desc="Saving output files")

 output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
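The matching removed above (and, as far as this capture shows, its replacement identify_similar_text_sequences further down) rests on the same scikit-learn primitives: vectorise the text with TfidfVectorizer, take pairwise cosine similarity, and keep the upper-triangle pairs above a threshold. A compact standalone illustration of that core step, with made-up page text (the printed score is indicative only):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pages = pd.Series([
    "the quick brown fox jumps over the lazy dog",
    "an unrelated page about invoices and totals",
    "the quick brown fox jumped over a lazy dog",
])

tfidf = TfidfVectorizer().fit_transform(pages)
sim = cosine_similarity(tfidf, dense_output=False).tocoo()

threshold = 0.5
pairs = [(r, c, v) for r, c, v in zip(sim.row, sim.col, sim.data)
         if r < c and v >= threshold]   # r < c keeps each unordered pair once
print(pairs)  # e.g. [(0, 2, 0.6...)] - pages 0 and 2 are near-duplicates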
@@ -405,18 +901,16 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
 Wrapper function updated to include the 'greedy_match' boolean.
 """
 if not files:
-
-    return None, None, None

 progress(0, desc="Combining input files...")
-df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)

 if df_combined.empty:
-
-    return None, None, None

 # Call the main analysis function with the new parameter
-results_df, output_paths, full_df =
 df_combined=df_combined,
 similarity_threshold=threshold,
 min_word_count=min_words,
@@ -428,7 +922,6 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:

 # Clip text to first 200 characters
 full_df['text'] = full_df['text'].str[:preview_length]
-
 # Preprocess full_data (without preview text) for fast access (run once)
 full_data_by_file = {
 file: df.sort_values('page').set_index('page')
@@ -438,7 +931,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
 if results_df.empty:
 gr.Info(f"No duplicate pages found, no results returned.")

-return results_df, output_paths, full_data_by_file

 def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
 """
@@ -523,10 +1016,29 @@ def add_new_annotations_to_existing_page_annotations(

 return all_annotations, newly_added_annotation_group

-def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=
 '''
-
 '''
 all_annotations = all_existing_annotations.copy()

 if not pymupdf_doc:
@@ -659,34 +1171,27 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
 review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
 review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')

-out_message = "Successfully created
 print(out_message)
 gr.Info(out_message)

 return review_file_out, all_annotations

-# --- 1. Helper Function to Parse the Combined Page/Line ID ---
 def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
-    """
-    """
-    # zfill ensures the string is padded with leading zeros to 10 characters
-    s_id = str(combined_id).zfill(10)
-    page = int(s_id[:5])
-    line = int(s_id[5:])
 return page, line

 def create_annotation_objects_from_duplicates(
 duplicates_df: pd.DataFrame,
 ocr_results_df: pd.DataFrame,
 page_sizes: List[Dict],
-combine_pages:bool=False
-) -> List[Dict]:
 """
 Creates structured annotation objects from duplicate line ranges, mapping
 page numbers to image paths.
@@ -702,8 +1207,12 @@ def create_annotation_objects_from_duplicates(
 """
 final_output = []

 if combine_pages == False:
-    # --- NEW: Create an efficient lookup map from page number to image path ---
     page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

 # Prepare OCR Data: Add a line number column if it doesn't exist
import pandas as pd
|
2 |
import os
|
3 |
import re
|
4 |
+
import itertools # For getting file pairs
|
5 |
+
import numpy as np # For efficient index finding
|
6 |
from tools.helper_functions import OUTPUT_FOLDER
|
7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
+
from typing import List, Tuple, Optional, Dict, Union
|
10 |
from collections import defaultdict
|
11 |
import gradio as gr
|
12 |
from gradio import Progress
|
|
|
18 |
nlp = en_core_web_lg.load()
|
19 |
|
20 |
similarity_threshold = 0.95
|
21 |
+
number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
|
22 |
+
ID_MULTIPLIER = 100000
|
23 |
+
|
24 |
+
def extract_indices_from_page_ranges(
|
25 |
+
results_df: pd.DataFrame,
|
26 |
+
start_col: str = 'Page2_Start_Page',
|
27 |
+
end_col: str = 'Page2_End_Page',
|
28 |
+
modulo_divisor_number_of_zeros: int = number_of_zeros_to_add_to_index, # Search for number of added
|
29 |
+
converted_index: bool = False # Has the index been converted to the page_no + 0000 + line number format that needs the modulo divisor to convert back?
|
30 |
+
) -> List[int]:
|
31 |
+
all_indices = set()
|
32 |
+
modulo_divisor = int("1" + modulo_divisor_number_of_zeros*"0")
|
33 |
+
|
34 |
+
for _, row in results_df.iterrows():
|
35 |
+
start_page = row[start_col]
|
36 |
+
end_page = row[end_col]
|
37 |
+
for encoded_page_id in range(start_page, end_page + 1):
|
38 |
+
if converted_index == True:
|
39 |
+
original_page, original_index = _parse_page_line_id(encoded_page_id)#(encoded_page_id % modulo_divisor) - 1
|
40 |
+
else:
|
41 |
+
original_index = encoded_page_id
|
42 |
+
|
43 |
+
all_indices.add(original_index)
|
44 |
+
return sorted(list(all_indices))
|
45 |
|
46 |
+
def run_full_search_and_analysis(
|
47 |
+
search_query_text: str,
|
48 |
+
word_level_df_orig: pd.DataFrame,
|
49 |
+
similarity_threshold: float = 1,
|
50 |
+
combine_pages: bool = False,
|
51 |
+
min_word_count: int = 1,
|
52 |
+
min_consecutive_pages: int = 1,
|
53 |
+
greedy_match: bool = True,
|
54 |
+
remake_index: bool = False,
|
55 |
+
progress=gr.Progress(track_tqdm=True)
|
56 |
+
):
|
57 |
"""
|
58 |
+
This function orchestrates the entire pipeline for finding duplicate pages based on a user's search query. It takes in the search query text, the original word-level OCR data, and various parameters to control the analysis. The function then:
|
59 |
+
|
60 |
+
1. Converts the user's search query into a DataFrame format suitable for analysis.
|
61 |
+
2. Prepares the main word-level OCR data for processing by converting it into the required format.
|
62 |
+
3. Combines the search query DataFrame with the prepared OCR data DataFrame.
|
63 |
+
4. Executes the similarity analysis on the combined data using the specified parameters such as similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
|
64 |
+
|
65 |
+
Parameters:
|
66 |
+
- search_query_text (str): The text entered by the user to search for in the OCR data.
|
67 |
+
- word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
|
68 |
+
- similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
|
69 |
+
- combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.
|
70 |
+
- min_word_count (int, optional): The minimum number of words required for a page to be considered in the analysis. Defaults to 1.
|
71 |
+
- min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
|
72 |
+
- greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
|
73 |
+
- remake_index (bool, optional): A flag indicating whether to remake the index of the DataFrame during processing. Defaults to False.
|
74 |
+
- progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
|
75 |
+
"""
|
76 |
+
|
77 |
+
if len(search_query_text) < 3:
|
78 |
+
raise Warning("Please use a search query with at least three letters.")
|
79 |
+
if len(search_query_text) > 100:
|
80 |
+
raise Warning("Please use a search query with at less than 100 characters.")
|
81 |
+
|
82 |
+
# Step 1: Process the user's search query string
|
83 |
+
search_query_data, query_word_length = create_dataframe_from_string(search_query_text, split_words=True)
|
84 |
+
if not search_query_data:
|
85 |
+
# Handle case where user submits an empty search string
|
86 |
+
raise Warning("Could not convert search string to required format")
|
87 |
+
|
88 |
+
if query_word_length > 10:
|
89 |
+
# Handle case where user submits an empty search string
|
90 |
+
raise Warning("Please use a query with less than 10 words")
|
91 |
+
|
92 |
+
# Overwrite min_consecutive_pages with the search string length
|
93 |
+
min_consecutive_pages = query_word_length
|
94 |
|
95 |
+
# Create word index from reference table
|
96 |
+
word_level_df_orig["index"] = word_level_df_orig.index
|
97 |
+
word_level_df = word_level_df_orig.copy()
|
98 |
+
|
99 |
+
# Step 2: Process the main word-level OCR DataFrame
|
100 |
+
word_level_data = convert_word_level_df(word_level_df, file_name="source_document")
|
101 |
+
|
102 |
+
# Step 3: Combine both data sources into one list
|
103 |
+
all_data_to_process = search_query_data + word_level_data
|
104 |
+
if not all_data_to_process:
|
105 |
+
raise gr.Error("No data to process. Please check your inputs.")
|
106 |
|
107 |
+
# Step 4: Run the combination logic
|
108 |
+
combined_df, _, full_out_ocr_df = combine_ocr_dataframes(
|
109 |
+
input_data=all_data_to_process,
|
110 |
+
combine_pages=combine_pages,
|
111 |
+
output_folder=None, # No need to save this intermediate file
|
112 |
+
remake_index=remake_index
|
113 |
+
)
|
114 |
+
|
115 |
+
# Step 5: Run the final similarity analysis on the combined data
|
116 |
+
results_df, duplicate_files, full_data = identify_similar_text_sequences(
|
117 |
+
df_combined=combined_df,
|
118 |
+
similarity_threshold=similarity_threshold,
|
119 |
+
min_word_count=min_word_count,
|
120 |
+
min_consecutive_pages=min_consecutive_pages,
|
121 |
+
greedy_match=greedy_match,
|
122 |
+
combine_pages=combine_pages,
|
123 |
+
inter_file_only=True,
|
124 |
+
do_text_clean=False,
|
125 |
+
progress=progress
|
126 |
+
)
|
127 |
+
|
128 |
+
print("Finished text search")
|
129 |
+
|
130 |
+
# Map the results back to the reference data file
|
131 |
+
if remake_index == True:
|
132 |
+
results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=True)
|
133 |
+
else:
|
134 |
+
results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=False)
|
135 |
+
|
136 |
+
word_level_df_out = word_level_df_orig.loc[word_level_df_orig["index"].isin(results_df_index_list)]
|
137 |
+
|
138 |
+
return word_level_df_out, duplicate_files, full_data
|
139 |
+
|
140 |
+
def create_all_data_to_process(converted_data:pd.DataFrame, other_data_list:List[Tuple]):
|
141 |
+
all_data_to_process = converted_data + other_data_list
|
142 |
+
return all_data_to_process
|
143 |
+
|
144 |
+
def convert_word_level_df(
|
145 |
+
word_level_df: pd.DataFrame,
|
146 |
+
file_name: str = "converted_dataframe"
|
147 |
+
) -> List[Tuple[str, pd.DataFrame]]:
|
148 |
+
"""
|
149 |
+
Converts a word-level OCR DataFrame to the format for
|
150 |
+
combine_ocr_dataframes.
|
151 |
+
|
152 |
+
A simple renaming and selection of relevant columns
|
153 |
+
|
154 |
+
Args:
|
155 |
+
word_level_df (pd.DataFrame):
|
156 |
+
A DataFrame containing detailed OCR output. Must include at least
|
157 |
+
the columns: 'page', 'line', and 'word_text'.
|
158 |
+
file_name (str, optional):
|
159 |
+
A unique identifier or "dummy" filename to assign to the resulting
|
160 |
+
data. Defaults to "converted_dataframe".
|
161 |
+
|
162 |
Returns:
|
163 |
+
List[Tuple[str, pd.DataFrame]]:
|
164 |
+
A list containing a single tuple of (file_name, DataFrame), ready
|
165 |
+
to be used as input for the combine_ocr_dataframes function. The
|
166 |
+
DataFrame will have 'page' and 'text' columns.
|
167 |
"""
|
168 |
+
# --- 1. Validate Input ---
|
169 |
+
required_columns = ['page', 'line', 'word_text']
|
170 |
+
if not all(col in word_level_df.columns for col in required_columns):
|
171 |
+
raise ValueError(f"Input DataFrame must contain all of the following columns: {required_columns}")
|
172 |
|
173 |
+
df = word_level_df.copy()
|
174 |
+
|
175 |
+
# --- 2. Process the DataFrame ---
|
176 |
+
# Ensure word_text is a string to allow for joining
|
177 |
+
df['word_text'] = df['word_text'].astype(str)
|
178 |
+
|
179 |
+
# Group by page and line number, then join the words with a space (not needed for word level search)
|
180 |
+
# The result is a Series with a MultiIndex (page, line)
|
181 |
+
#line_text_series = df.groupby(['page', 'line'])['word_text'].apply(' '.join)
|
182 |
+
|
183 |
+
# Convert the Series back to a DataFrame and reset the index
|
184 |
+
#line_level_df = line_text_series.reset_index()
|
185 |
+
|
186 |
+
# Rename the aggregated column from 'word_text' to the required 'text'
|
187 |
+
df = df.rename(columns={'word_text': 'text'})
|
188 |
+
|
189 |
+
# --- 3. Finalise the structure ---
|
190 |
+
# We now have a DataFrame with columns [page, line, text].
|
191 |
+
final_df = df[['page', 'text']]
|
192 |
+
|
193 |
+
# --- 4. Package for output ---
|
194 |
+
# Return in the required List[Tuple[str, DataFrame]] format
|
195 |
+
return [(file_name, final_df)]
|
196 |
+
|
197 |
+
def create_dataframe_from_string(
|
198 |
+
text_string: str,
|
199 |
+
file_name: str = "user_search_query",
|
200 |
+
page_number: int = 1,
|
201 |
+
split_words: bool = False
|
202 |
+
) -> Tuple[List[Tuple[str, pd.DataFrame]], int]:
|
203 |
+
"""
|
204 |
+
Converts a string into a DataFrame compatible with combine_ocr_dataframes.
|
205 |
+
|
206 |
+
Can operate in two modes:
|
207 |
+
1. As a single-line document (default).
|
208 |
+
2. As a multi-line document where each word from the string is a separate line.
|
209 |
+
|
210 |
+
Args:
|
211 |
+
text_string (str): The input text to be placed in the DataFrame.
|
212 |
+
file_name (str, optional): A dummy filename to assign to this text.
|
213 |
+
Defaults to "user_search_query".
|
214 |
+
page_number (int, optional): A dummy page number to assign. Defaults to 1.
|
215 |
+
split_words (bool, optional): If True, splits the input string by
|
216 |
+
whitespace and creates a row for each word.
|
217 |
+
If False (default), the entire string is
|
218 |
+
treated as a single text entry.
|
219 |
+
|
220 |
+
Returns:
|
221 |
+
Tuple[List[Tuple[str, pd.DataFrame]], int]:
|
222 |
+
A list containing a single tuple: (file_name, DataFrame).
|
223 |
+
The DataFrame has 'page' and 'text' columns. Also, an integer value indicating the number of words in the search string.
|
224 |
+
Returns an empty list if the input string is empty or whitespace.
|
225 |
+
"""
|
226 |
+
# Handle empty input gracefully, this works for both modes.
|
227 |
+
if not text_string or not text_string.strip():
|
228 |
+
print("Warning: Input string is empty. Returning an empty list.")
|
229 |
+
return [], 0
|
230 |
+
|
231 |
+
if split_words:
|
232 |
+
# --- MODE 2: Split string into words, one per row ---
|
233 |
+
words = text_string.split()
|
234 |
+
len_words = len(words)
|
235 |
+
data = {
|
236 |
+
# Assign the same page number to every word
|
237 |
+
'page': [page_number] * len(words),
|
238 |
+
# The list of words becomes the text column
|
239 |
+
'text': words
|
240 |
+
}
|
241 |
else:
|
242 |
+    # --- MODE 1: Original behavior, entire string in one row ---
+    len_words = 1
+    data = {
+        'page': [page_number],
+        'text': [text_string]
+    }
+
+    # Create the DataFrame from the prepared data
+    df = pd.DataFrame(data)
+
+    df["line"] = df.index + 1
+
+    # Return it in the required format: a list containing one (name, df) tuple
+    return [(file_name, df)], len_words
+
+def combine_ocr_dataframes(
+    input_data: List[Tuple[str, pd.DataFrame]],
+    combine_pages: bool = True,
+    output_folder: str = OUTPUT_FOLDER,
+    output_filename: str = "combined_ocr_output.csv",
+    number_of_added_zeros: int = number_of_zeros_to_add_to_index,
+    remake_index: bool = True
+) -> Tuple[pd.DataFrame, List[str]]:
+    """
+    Combines text from multiple pandas DataFrames containing page and text columns.
+
+    This function takes a list of (name, DataFrame) tuples, processes each DataFrame
+    by grouping and concatenating text, and then combines them into a single DataFrame.
+
+    Args:
+        input_data (List[Tuple[str, pd.DataFrame]]):
+            A list of tuples, where each tuple contains a unique identifier (like a filename)
+            and a pandas DataFrame. Each DataFrame must have 'page' and 'text' columns.
+        combine_pages (bool, optional):
+            If True, text from the same page number within a file is joined into a
+            single row. If False, each line of text gets its own row with a unique
+            page identifier. Defaults to True.
+        output_folder (str, optional):
+            The folder where the combined CSV file will be saved. Defaults to OUTPUT_FOLDER.
+        output_filename (str, optional):
+            The name of the output CSV file. Defaults to "combined_ocr_output.csv".
+
+    Returns:
+        Tuple[pd.DataFrame, List[str]]:
+            A tuple containing:
+            - The final combined and processed DataFrame.
+            - A list containing the path to the saved output CSV file.
+    """
+    all_data = []
+
+    for file_identifier, df_initial in input_data:
+        df = df_initial.copy()  # Work on a copy to avoid side effects
+
+        # --- Validation ---
        if 'page' not in df.columns or 'text' not in df.columns:
+            print(f"Warning: Skipping data for '{file_identifier}' - missing required columns 'page' and 'text'.")
            continue

+        # --- Processing ---
        df['text'] = df['text'].fillna('').astype(str)
+
+        if combine_pages:
+            # Group by page and concatenate text into a single string
+            processed_df = df.groupby('page')['text'].apply(' '.join).reset_index()
        else:
+            if remake_index == True:
+                # # Create a unique, sortable page ID for each line without combining
+                # df['line_number_by_page'] = df.groupby('page').cumcount() + 1
+                # df['original_page'] = df['page']
+                # # Create a new page ID that combines page and line number for uniqueness
+                # df['page'] = (
+                #     df['page'].astype(str).str.zfill(number_of_added_zeros) +
+                #     df['line_number_by_page'].astype(str).str.zfill(number_of_added_zeros)
+                # ).astype(int)
+
+                # Define the multiplier based on the max expected lines per page.
+                # If you expect up to 99,999 lines, use 100,000.
+
+                df['line_number_by_page'] = df.groupby('page').cumcount() + 1
+                df['original_page'] = df['page']
+
+                # Create the new combined ID using arithmetic
+                df['page'] = (df['original_page'] * ID_MULTIPLIER) + df['line_number_by_page']
+
+            else:
+                if not 'index' in df.columns:
+                    df['index'] = df.index
+                df['page'] = df['index']
+
+            processed_df = df
+
+        # Add the file identifier column
+        processed_df['file'] = file_identifier
+        all_data.append(processed_df)

    if not all_data:
+        raise ValueError("No valid DataFrames were processed. Ensure input data is not empty and DataFrames have 'page' and 'text' columns.")
+
+    # --- Final Combination ---
    combined_df = pd.concat(all_data, ignore_index=True)
+
+    # Reorder columns to a standard format, dropping intermediate columns
+    final_columns = ['file', 'page', 'text']
+    if 'original_page' in combined_df.columns:
+        final_columns.append('original_page')  # Keep for context if created

+    # Ensure all final columns exist before trying to select them
+    existing_final_columns = [col for col in final_columns if col in combined_df.columns]

+    full_out_ocr_df = combined_df
+    combined_df = combined_df.copy()[existing_final_columns]

+    # --- Save Output ---
+    output_files = []
+    if output_folder and output_filename:
+        os.makedirs(output_folder, exist_ok=True)
+        output_path = os.path.join(output_folder, output_filename)
+        combined_df.to_csv(output_path, index=False)
+        output_files.append(output_path)
+        print(f"Successfully combined data and saved to: {output_path}")
+
+    return combined_df, output_files, full_out_ocr_df
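A minimal usage sketch of combine_ocr_dataframes with combine_pages=False (illustrative only; it assumes ID_MULTIPLIER is 100000, as the in-code comment about 99,999 lines per page suggests):

import pandas as pd

ocr_a = pd.DataFrame({'page': [1, 1, 2], 'text': ['first line', 'second line', 'other page']})
ocr_b = pd.DataFrame({'page': [1], 'text': ['first line']})

combined, paths, full_ocr = combine_ocr_dataframes(
    input_data=[('doc_a.csv', ocr_a), ('doc_b.csv', ocr_b)],
    combine_pages=False,   # one row per OCR line, with a combined page/line ID
    output_folder=None,    # skip writing a CSV in this sketch
    output_filename=None,
)
# With ID_MULTIPLIER == 100000, page 1 line 2 of doc_a becomes page ID 100002
# and page 2 line 1 becomes 200001; 'original_page' keeps the real page number.
print(combined[['file', 'page', 'original_page']])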
+
+def combine_ocr_output_text(
+    input_files: Union[str, List[str]],
+    combine_pages: bool = True,
+    remake_index: bool = True,
+    output_folder: str = OUTPUT_FOLDER
+) -> Tuple[pd.DataFrame, List[str]]:
+    """
+    Reads multiple OCR CSV files, combines them, and saves the result.
+
+    This function serves as a wrapper that reads CSV files from paths and then
+    uses the `combine_ocr_dataframes` function to perform the combination logic.
+
+    Args:
+        input_files (Union[str, List[str]]): A single file path or a list of file paths.
+        combine_pages (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
+        output_folder (str, optional): See `combine_ocr_dataframes`. Defaults to OUTPUT_FOLDER.
+
+    Returns:
+        Tuple[pd.DataFrame, List[str]]: The combined DataFrame and the path to the output file.
+    """
+    if isinstance(input_files, str):
+        file_paths_list = [input_files]
+    else:
+        file_paths_list = input_files
+
+    data_to_process = []
+    for file_path in file_paths_list:
+        try:
+            df = pd.read_csv(file_path)
+            # Use the base filename as the identifier
+            file_identifier = os.path.basename(file_path)
+            data_to_process.append((file_identifier, df))
+        except FileNotFoundError:
+            print(f"Warning: File not found, skipping: {file_path}")
+        except Exception as e:
+            print(f"Warning: Failed to read or process {file_path}. Error: {e}")

+    if not data_to_process:
+        raise ValueError("No valid CSV files could be read or processed.")
+
+    # Call the core function with the loaded data
+    return combine_ocr_dataframes(
+        input_data=data_to_process,
+        combine_pages=combine_pages,
+        output_folder=output_folder,
+        output_filename="combined_ocr_from_files.csv",  # Specific name for this path
+        remake_index=remake_index
+    )
+
+def clean_and_stem_text_series(df:pd.DataFrame, column:str):
    '''
    Clean and stem text columns in a data frame
    '''

    return output_paths

+# def identify_similar_text_sequences(
+#     df_combined: pd.DataFrame,
+#     similarity_threshold: float = 0.9,
+#     min_word_count: int = 10,
+#     min_consecutive_pages: int = 1,
+#     greedy_match: bool = False,
+#     combine_pages:bool=True,
+#     output_folder: str = OUTPUT_FOLDER,
+#     progress=Progress(track_tqdm=True)
+# ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
+#     """
+#     Identifies similar pages with three possible strategies:
+#     1. Single Page: If greedy_match=False and min_consecutive_pages=1.
+#     2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
+#     3. Greedy Consecutive Match: If greedy_match=True.
+#     """
+
+#     output_paths = []
+#     progress(0.1, desc="Processing and filtering text")
+#     df = clean_and_stem_text_series(df_combined, 'text')
+#     df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
+#     original_row_count = len(df)
+#     df_filtered = df[df['word_count'] >= min_word_count].copy()
+#     df_filtered.reset_index(drop=True, inplace=True)
+
+#     print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
+
+#     if len(df_filtered) < 2:
+#         return pd.DataFrame(), [], df_combined
+
+#     vectorizer = TfidfVectorizer()
+#     tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
+
+#     progress(0.3, desc="Calculating text similarity")
+#     similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
+#     coo_matrix = similarity_matrix.tocoo()
+
+#     # Create a DataFrame of all individual page pairs above the threshold.
+#     # This is the base for all three matching strategies.
+#     similar_pages = [
+#         (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
+#         if r < c and v >= similarity_threshold
+#     ]
+
+#     if not similar_pages:
+#         return pd.DataFrame(), [], df_combined
+
+#     base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
+
+#     progress(0.6, desc="Aggregating results based on matching strategy")
+
+#     if greedy_match:
+#         print("Finding matches using greedy consecutive strategy.")
+
+#         # A set of pairs for fast lookups of (page1_idx, page2_idx)
+#         valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
+
+#         # Keep track of indices that have been used in a sequence
+#         consumed_indices_1 = set()
+#         consumed_indices_2 = set()
+
+#         all_sequences = []
+
+#         # Iterate through all potential starting pairs, sorted for consistent results
+#         sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
+
+#         for _, row in sorted_pairs.iterrows():
+#             start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
+
+#             # If this pair has already been consumed by a previous sequence, skip it
+#             if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
+#                 continue
+
+#             # This is a new sequence, start expanding it
+#             current_sequence = [(start_idx1, start_idx2)]
+#             k = 1
+#             while True:
+#                 next_idx1 = start_idx1 + k
+#                 next_idx2 = start_idx2 + k
+
+#                 # Check if the next pair in the sequence is a valid match
+#                 if (next_idx1, next_idx2) in valid_pairs_set and \
+#                    next_idx1 not in consumed_indices_1 and \
+#                    next_idx2 not in consumed_indices_2:
+#                     current_sequence.append((next_idx1, next_idx2))
+#                     k += 1
+#                 else:
+#                     # The sequence has ended
+#                     break
+
+#             # Record the found sequence and mark all its pages as consumed
+#             sequence_indices_1 = [p[0] for p in current_sequence]
+#             sequence_indices_2 = [p[1] for p in current_sequence]
+
+#             all_sequences.append({
+#                 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
+#                 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
+#                 'Match_Length': len(current_sequence)
+#             })
+
+#             consumed_indices_1.update(sequence_indices_1)
+#             consumed_indices_2.update(sequence_indices_2)
+
+#         if not all_sequences:
+#             return pd.DataFrame(), [], df_combined
+
+#         subdocument_df = pd.DataFrame(all_sequences)
+
+#     elif min_consecutive_pages > 1:
+#         # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
+#         print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
+#         similarity_df = base_similarity_df.copy()
+#         similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
+#         is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
+#         block_id = is_consecutive.eq(False).cumsum()
+#         grouped = similarity_df.groupby(block_id)
+#         agg_results = grouped.agg(
+#             Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
+#             Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
+#             Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
+#         ).reset_index(drop=True)
+#         subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
+#         if subdocument_df.empty: return pd.DataFrame(), [], df_combined
+
+#     else:
+#         # --- STRATEGY 1: Single Page Matching ---
+#         print(f"Finding single page matches (min_consecutive_pages=1)")
+#         final_df = map_metadata_single_page(base_similarity_df, df_filtered)
+#         # The rest of the logic (saving files) is handled after this if/else block
+#         pass # The final_df is already prepared
+
+#     # --- Map metadata and format output ---
+#     # This block now handles the output for both subdocument strategies (2 and 3)
+#     if greedy_match or min_consecutive_pages > 1:
+#         final_df = map_metadata_subdocument(subdocument_df, df_filtered)
+
+#     progress(0.8, desc="Saving output files")
+
+#     output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
+
+#     return final_df, output_paths, df_combined
+
+def _calculate_inter_file_similarity(df_filtered:pd.DataFrame, vectorizer, similarity_threshold:float, progress:gr.Progress):
+    """
+    Helper function to efficiently calculate similarity ONLY between different files.
+    """
+    print("Calculating inter-file similarity.")
+
+    # Step 1: Fit the vectorizer on the ENTIRE corpus to create a shared vocabulary.
+    # This is crucial for comparing vectors from different files meaningfully.
+    progress(0.2, desc="Building vocabulary...")
+    vectorizer.fit(df_filtered['text_clean'])
+
+    # Step 2: Group the DataFrame by file.
+    file_groups = df_filtered.groupby('file')
+    unique_files = list(file_groups.groups.keys())
+    all_similar_pairs = []
+
+    # Step 3: Iterate through all unique pairs of files.
+    file_combinations = list(itertools.combinations(unique_files, 2))
+
+    for i, (file1_name, file2_name) in enumerate(progress.tqdm(file_combinations, desc="Comparing file pairs")):
+        group1_df = file_groups.get_group(file1_name)
+        group2_df = file_groups.get_group(file2_name)
+
+        # Step 4: Use the pre-fitted vectorizer to TRANSFORM (not fit_transform) the text of each group.
+        tfidf1 = vectorizer.transform(group1_df['text_clean'])
+        tfidf2 = vectorizer.transform(group2_df['text_clean'])
+
+        # Step 5: Calculate similarity between the two groups.
+        # This is a much smaller matrix (pages_in_file1 x pages_in_file2).
+        similarity_matrix_subset = cosine_similarity(tfidf1, tfidf2)
+
+        # Step 6: Find pairs in this sub-matrix that exceed the threshold.
+        # `np.where` is highly efficient for this.
+        page1_indices, page2_indices = np.where(similarity_matrix_subset >= similarity_threshold)
+
+        # Step 7: Map the local indices back to the original global indices from `df_filtered`.
+        for p1_local_idx, p2_local_idx in zip(page1_indices, page2_indices):
+            global_idx1 = group1_df.index[p1_local_idx]
+            global_idx2 = group2_df.index[p2_local_idx]
+            score = similarity_matrix_subset[p1_local_idx, p2_local_idx]
+
+            # Ensure r < c convention to match the original method
+            r, c = min(global_idx1, global_idx2), max(global_idx1, global_idx2)
+            all_similar_pairs.append((r, c, score))
+
+    if not all_similar_pairs:
+        return pd.DataFrame()
+
+    # Create the final DataFrame, which is now pre-filtered.
+    return pd.DataFrame(all_similar_pairs, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
+
+
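A rough sketch of the fit-once, transform-per-file idea used by _calculate_inter_file_similarity (illustrative only; the real helper also tracks global indices and a Gradio progress bar):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pages = pd.DataFrame({
    'file': ['a.csv', 'a.csv', 'b.csv'],
    'text_clean': ['quick brown fox', 'lazy dog', 'quick brown fox'],
})
vectorizer = TfidfVectorizer().fit(pages['text_clean'])  # shared vocabulary across all files
tfidf_a = vectorizer.transform(pages.loc[pages['file'] == 'a.csv', 'text_clean'])
tfidf_b = vectorizer.transform(pages.loc[pages['file'] == 'b.csv', 'text_clean'])
# Rows are pages of a.csv, columns are pages of b.csv; entry [0, 0] is ~1.0 here.
print(cosine_similarity(tfidf_a, tfidf_b))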
+def identify_similar_text_sequences(
    df_combined: pd.DataFrame,
+    similarity_threshold: float = 1,
+    min_word_count: int = 1,
    min_consecutive_pages: int = 1,
+    greedy_match: bool = True,
+    combine_pages: bool = False,
+    inter_file_only: bool = False,
+    do_text_clean:bool = True,
+    output_folder: str = "output/",
    progress=Progress(track_tqdm=True)
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
    """
+    Identifies similar pages. Uses a highly optimized path for inter_file_only=True.
    """

    progress(0.1, desc="Processing and filtering text")
+
+    if do_text_clean:
+        df = clean_and_stem_text_series(df_combined, 'text')  # Will produce the column 'text_clean'
+    else:
+        df = df_combined.copy()
+        df['text_clean'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
+
    df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
+
    original_row_count = len(df)
    df_filtered = df[df['word_count'] >= min_word_count].copy()
    df_filtered.reset_index(drop=True, inplace=True)

    print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")

    if len(df_filtered) < 2:
        return pd.DataFrame(), [], df_combined

+    vectorizer = TfidfVectorizer()

+    # Similarity calculated differently if comparing between files only (inter_file_only==True), or within the same file
+    if inter_file_only:
+        # Use the new, highly efficient helper function.
+        base_similarity_df = _calculate_inter_file_similarity(df_filtered, vectorizer, similarity_threshold, progress)
+        if base_similarity_df.empty:
+            return pd.DataFrame(), [], df_combined

+    else:
+        # Use the original, simpler path for all-to-all comparisons (including intra-file).
+        print("Standard Path: Calculating all-to-all similarity.")
+        progress(0.2, desc="Vectorizing text...")
+        tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])

+        progress(0.3, desc="Calculating similarity matrix...")
+        similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
+        coo_matrix = similarity_matrix.tocoo()

+        similar_pages = [
+            (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
+            if r < c and v >= similarity_threshold
+        ]

+        if not similar_pages:
            return pd.DataFrame(), [], df_combined
+
+        base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
+
+    progress(0.6, desc="Aggregating results based on matching strategy")

+    if greedy_match or min_consecutive_pages > 1:
+        print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
+
+        # Sort the dataframe to ensure consecutive pages are adjacent
+        similarity_df = base_similarity_df  # .sort_values(['Page1_Index', 'Page2_Index']).copy()

+        # A new sequence starts if the difference from the previous row is not (1, 1)
+        # is_consecutive will be True if a row continues the sequence, False if it's a new one.
        is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
+
+        # Use cumsum() on the inverted boolean series to create a unique ID for each block.
+        # Every time a 'False' appears (a new block starts), the sum increases.
        block_id = is_consecutive.eq(False).cumsum()
+
+        # Group by this block ID
        grouped = similarity_df.groupby(block_id)
+
+        # Aggregate each group to get the start, end, and length of the match
        agg_results = grouped.agg(
+            Page1_Start_Index=('Page1_Index', 'first'),
+            Page2_Start_Index=('Page2_Index', 'first'),
+            Page1_End_Index=('Page1_Index', 'last'),
+            Page2_End_Index=('Page2_Index', 'last'),
+            Match_Length=('Page1_Index', 'size'),
+            Avg_Similarity=('Similarity_Score', 'mean')
        ).reset_index(drop=True)

+        # If greedy_match=True, we keep all matches. If min_consecutive_pages > 1, we filter.
+        if greedy_match and min_consecutive_pages <= 1:
+            subdocument_df = agg_results
+        else:
+            # This handles the case for min_consecutive_pages > 1
+            subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
+
+        if subdocument_df.empty:
+            gr.Info("No matches found")
+            return pd.DataFrame(), [], df_combined
+
+        final_df = map_metadata_subdocument(subdocument_df, df_filtered)
    else:
+        print(f"Finding single page matches, not greedy (min_consecutive_pages=1)")
+        # This part of your code would handle the non-sequential case
        final_df = map_metadata_single_page(base_similarity_df, df_filtered)
+        subdocument_df = final_df  # To align variable names for saving
+
+        if subdocument_df.empty:
+            gr.Info("No matches found")
+            return pd.DataFrame(), [], df_combined

    progress(0.8, desc="Saving output files")

    output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
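A small, self-contained illustration of the diff()/cumsum() trick used above to group consecutive page pairs into blocks (illustrative only):

import pandas as pd

pairs = pd.DataFrame({'Page1_Index': [3, 4, 5, 9], 'Page2_Index': [20, 21, 22, 40]})
is_consecutive = (pairs['Page1_Index'].diff() == 1) & (pairs['Page2_Index'].diff() == 1)
block_id = is_consecutive.eq(False).cumsum()
print(block_id.tolist())  # [1, 1, 1, 2] - rows 0-2 form one run, row 3 starts a new one
print(pairs.groupby(block_id).agg(
    start=('Page1_Index', 'first'), end=('Page1_Index', 'last'), length=('Page1_Index', 'size')
))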
    Wrapper function updated to include the 'greedy_match' boolean.
    """
    if not files:
+        raise Warning("Please upload files to analyse.")

    progress(0, desc="Combining input files...")
+    df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)

    if df_combined.empty:
+        raise Warning("No data found in the uploaded files.")

    # Call the main analysis function with the new parameter
+    results_df, output_paths, full_df = identify_similar_text_sequences(
        df_combined=df_combined,
        similarity_threshold=threshold,
        min_word_count=min_words,

    # Clip text to first 200 characters
    full_df['text'] = full_df['text'].str[:preview_length]

    # Preprocess full_data (without preview text) for fast access (run once)
    full_data_by_file = {
        file: df.sort_values('page').set_index('page')

    if results_df.empty:
        gr.Info(f"No duplicate pages found, no results returned.")

+    return results_df, output_paths, full_data_by_file

def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
    """

    return all_annotations, newly_added_annotation_group

+def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=list()):
    '''
+    This function applies redactions to whole pages based on a provided list of duplicate page numbers. It supports two modes of operation: combining pages and not combining pages. When combining pages is enabled, it attempts to identify duplicate pages across different files and applies redactions accordingly. If combining pages is disabled, it relies on new annotations with bounding boxes to determine which pages to redact. The function utilises a PyMuPDF document object to manipulate the PDF file, and it also considers the sizes of pages to ensure accurate redaction application.
+
+    Args:
+        duplicate_page_numbers_df (pd.DataFrame): A DataFrame containing page numbers identified as duplicates.
+        doc_file_name_with_extension_textbox (str): The name of the document file with its extension.
+        review_file_state (pd.DataFrame): The current state of the review file.
+        duplicate_output_paths (list[str]): A list of paths to files containing duplicate page information.
+        pymupdf_doc (object): A PyMuPDF document object representing the PDF file.
+        page_sizes (list[dict]): A list of dictionaries containing page size information.
+        all_existing_annotations (list[dict]): A list of all existing annotations in the document.
+        combine_pages (bool, optional): A flag indicating whether to combine pages for redaction. Defaults to True.
+        new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
    '''
+    if all_existing_annotations is None:
+        all_existing_annotations = []
+
+    if new_annotations_with_bounding_boxes is None:
+        new_annotations_with_bounding_boxes = []
+
+    print("new_annotations_with_bounding_boxes:", new_annotations_with_bounding_boxes)
+
    all_annotations = all_existing_annotations.copy()

    if not pymupdf_doc:

    review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
    review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')

+    out_message = "Successfully created duplicate text redactions."
    print(out_message)
    gr.Info(out_message)

    return review_file_out, all_annotations

def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
+    """Parses a combined ID using modular arithmetic."""
+    if int(combined_id) < ID_MULTIPLIER:
+        # Handle cases where page is 0 (or just an edge case)
+        return 0, combined_id
+
+    page = combined_id // ID_MULTIPLIER
+    line = combined_id % ID_MULTIPLIER

    return page, line

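A couple of worked values for the combined page/line IDs, assuming ID_MULTIPLIER is 100000 as the comments in combine_ocr_dataframes suggest (illustrative only):

# page = combined_id // ID_MULTIPLIER, line = combined_id % ID_MULTIPLIER
# _parse_page_line_id(100002) -> (1, 2)   # page 1, line 2
# _parse_page_line_id(200001) -> (2, 1)   # page 2, line 1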
def create_annotation_objects_from_duplicates(
    duplicates_df: pd.DataFrame,
    ocr_results_df: pd.DataFrame,
    page_sizes: List[Dict],
+    combine_pages:bool=False) -> List[Dict]:
    """
    Creates structured annotation objects from duplicate line ranges, mapping
    page numbers to image paths.

    """
    final_output = []

+    if duplicates_df.empty:
+        raise Warning("No duplicates found")
+    if ocr_results_df.empty:
+        raise Warning("No OCR results found for file under review. Please upload relevant OCR_output file for the PDF file on the review tab.")
+
    if combine_pages == False:

        page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

        # Prepare OCR Data: Add a line number column if it doesn't exist
tools/helper_functions.py
CHANGED
@@ -22,7 +22,7 @@ def reset_state_vars():
        show_share_button=False,
        show_remove_button=False,
        interactive=False
-    ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0
+    ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0, []

def reset_ocr_results_state():
    return pd.DataFrame(), pd.DataFrame(), []
@@ -573,7 +573,10 @@ def reset_base_dataframe(df:pd.DataFrame):
    return df

def reset_ocr_base_dataframe(df:pd.DataFrame):
-
+    if df.empty:
+        return pd.DataFrame()
+    else:
+        return df.loc[:, ["page", "text"]]

def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_redaction_value:str):
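A tiny sanity check of the new reset_ocr_base_dataframe behaviour (illustrative only):

import pandas as pd

ocr = pd.DataFrame({'page': [1, 1], 'text': ['hello', 'world'], 'line': [1, 2]})
print(reset_ocr_base_dataframe(ocr).columns.tolist())  # ['page', 'text']
print(reset_ocr_base_dataframe(pd.DataFrame()).empty)  # True - empty input short-circuits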
tools/redaction_review.py
CHANGED
@@ -404,6 +404,68 @@ def _generate_unique_ids(

    return list(newly_generated_ids)

+def _merge_horizontally_adjacent_boxes(
+    df: pd.DataFrame,
+    x_merge_threshold: int = 0.02
+) -> pd.DataFrame:
+    """
+    Merges horizontally adjacent bounding boxes within the same line.
+
+    Args:
+        df (pd.DataFrame): DataFrame containing annotation boxes with columns
+            like 'page', 'line', 'xmin', 'xmax', etc.
+        x_merge_threshold (int): The maximum pixel gap on the x-axis to
+            consider two boxes as adjacent.
+
+    Returns:
+        pd.DataFrame: A new DataFrame with adjacent boxes merged.
+    """
+    if df.empty:
+        return df
+
+    # 1. Sort values to ensure we are comparing adjacent boxes
+    df_sorted = df.sort_values(by=['page', 'line', 'xmin']).copy()
+
+    # 2. Identify groups of boxes to merge using shift() and cumsum()
+    # Get properties of the 'previous' box in the sorted list
+    prev_xmax = df_sorted['xmax'].shift(1)
+    prev_page = df_sorted['page'].shift(1)
+    prev_line = df_sorted['line'].shift(1)
+
+    # A box should be merged with the previous one if it's on the same page/line
+    # and the horizontal gap is within the threshold.
+    is_adjacent = (
+        (df_sorted['page'] == prev_page) &
+        (df_sorted['line'] == prev_line) &
+        (df_sorted['xmin'] - prev_xmax <= x_merge_threshold)
+    )
+
+    # A new group starts wherever a box is NOT adjacent to the previous one.
+    # cumsum() on this boolean series creates a unique ID for each group.
+    df_sorted['merge_group'] = (~is_adjacent).cumsum()
+
+    # 3. Aggregate each group into a single bounding box
+    # Define how to aggregate each column
+    agg_funcs = {
+        'xmin': 'min',
+        'ymin': 'min',  # To get the highest point of the combined box
+        'xmax': 'max',
+        'ymax': 'max',  # To get the lowest point of the combined box
+        'text': lambda s: ' '.join(s.astype(str)),  # Join the text
+        # Carry over the first value for columns that are constant within a group
+        'page': 'first',
+        'line': 'first',
+        'image': 'first',
+        'label': 'first',
+        'color': 'first',
+    }
+
+    merged_df = df_sorted.groupby('merge_group').agg(agg_funcs).reset_index(drop=True)
+
+    print(f"Merged {len(df)} annotations into {len(merged_df)}.")
+
+    return merged_df
+
def create_annotation_objects_from_filtered_ocr_results_with_words(
    filtered_ocr_results_with_words_df: pd.DataFrame,
    ocr_results_with_words_df_base: pd.DataFrame,
@@ -411,54 +473,45 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
    existing_annotations_df: pd.DataFrame,
    existing_annotations_list: List[Dict],
    existing_recogniser_entity_df: pd.DataFrame,
-    progress=gr.Progress(track_tqdm=True)
+    progress = gr.Progress(track_tqdm=True)
) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
-
+    This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
+
+    Args:
+        filtered_ocr_results_with_words_df (pd.DataFrame): A DataFrame containing filtered OCR results with words.
+        ocr_results_with_words_df_base (pd.DataFrame): The base DataFrame of OCR results with words.
+        page_sizes (List[Dict]): A list of dictionaries containing page sizes.
+        existing_annotations_df (pd.DataFrame): A DataFrame of existing annotations.
+        existing_annotations_list (List[Dict]): A list of dictionaries representing existing annotations.
+        existing_recogniser_entity_df (pd.DataFrame): A DataFrame of existing recogniser entities.
+        progress (gr.Progress, optional): A progress tracker. Defaults to gr.Progress(track_tqdm=True).
+
+    Returns:
+        Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
    """

+    progress(0.2, "Identifying new redactions to add")
    print("Identifying new redactions to add")
-    progress(0.1, "Identifying new redactions to add")
    if filtered_ocr_results_with_words_df.empty:
        print("No new annotations to add.")
        updated_annotations_df = existing_annotations_df.copy()
    else:
-        #
-        # new_annotations_df = pd.merge(
-        #     ocr_results_with_words_df_base,
-        #     filtered_ocr_results_with_words_df[join_keys],
-        #     on=join_keys,
-        #     how='inner'
-        # )
-
+        # Assuming index relationship holds for fast lookup
        filtered_ocr_results_with_words_df.index = filtered_ocr_results_with_words_df["index"]
-
        new_annotations_df = ocr_results_with_words_df_base.loc[filtered_ocr_results_with_words_df.index].copy()

        if new_annotations_df.empty:
            print("No new annotations to add.")
            updated_annotations_df = existing_annotations_df.copy()
        else:
-            # --- Custom ID Generation ---
-            progress(0.2, "Creating new redaction IDs")
-            # 1. Get all IDs that already exist to ensure we don't create duplicates.
-            existing_ids = set()
-            if 'id' in existing_annotations_df.columns:
-                existing_ids = set(existing_annotations_df['id'].dropna())
-
-            # 2. Generate the exact number of new, unique IDs required.
-            num_new_ids = len(new_annotations_df)
-            new_id_list = _generate_unique_ids(num_new_ids, existing_ids)
-
-            # 3. Assign the new IDs and other columns in a vectorized way.
            page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}

-
+            # Prepare the initial new annotations DataFrame
            new_annotations_df = new_annotations_df.assign(
                image=lambda df: df['page'].map(page_to_image_map),
                label="Redaction",
-                color='(0, 0, 0)'
-                id=new_id_list # Assign the pre-generated list of unique IDs
+                color='(0, 0, 0)'
            ).rename(columns={
                'word_x0': 'xmin',
                'word_y0': 'ymin',
@@ -466,6 +519,17 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
                'word_y1': 'ymax',
                'word_text': 'text'
            })
+
+            progress(0.3, "Checking for adjacent annotations to merge...")
+            print("Checking for adjacent annotations to merge...")
+            new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
+
+            progress(0.4, "Creating new redaction IDs...")
+            print("Creating new redaction IDs...")
+            existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
+            num_new_ids = len(new_annotations_df)
+            new_id_list = _generate_unique_ids(num_new_ids, existing_ids)
+            new_annotations_df['id'] = new_id_list

            annotation_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
            new_annotations_df = new_annotations_df[annotation_cols]
@@ -477,7 +541,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
    if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
        unique_new_df = new_annotations_df
    else:
-        # I'm not doing checks
+        # I'm not doing checks against existing as it is too compute intensive in large documents
        # merged = pd.merge(
        #     new_annotations_df,
        #     existing_annotations_df[key_cols].drop_duplicates(),
@@ -508,16 +572,39 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
        merged_df = pd.merge(all_pages_df[['image']], updated_annotations_df, on='image', how='left')
    else:
        merged_df = all_pages_df[['image']]
-
+
+    # 1. Get the list of image paths in the exact order they appear in page_sizes.
+    #    all_pages_df was created from page_sizes, so it preserves this order.
+    image_order = all_pages_df['image'].tolist()
+
+    # 2. Convert the 'image' column to a special 'Categorical' type.
+    #    This tells pandas that this column has a custom, non-alphabetical order.
+    merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
+
+    # 3. Sort the DataFrame based on this new custom order.
+    merged_df = merged_df.sort_values('image')
+
+    # --- NEW CODE END ---
+
    final_annotations_list = []
    box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']

-
+    # Now, when we group, we use `sort=False`. This tells groupby to respect the
+    # DataFrame's current order, which we have just manually set. This is slightly
+    # more efficient than letting it sort again.
+    for image_path, group in merged_df.groupby('image', sort=False):
+        # The progress.tqdm wrapper can be added back around the groupby object as you had it.
+        # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):
+
+        # Check if the group has actual annotations. iloc[0] is safe because even pages
+        # without annotations will have one row with NaN values from the merge.
        if pd.isna(group.iloc[0].get('id')):
            boxes = []
        else:
            valid_box_cols = [col for col in box_cols if col in group.columns]
-            boxes
+            # We should also sort the boxes within a page for consistency (e.g., left-to-right)
+            sorted_group = group.sort_values(by=['ymin', 'xmin'])
+            boxes = sorted_group[valid_box_cols].to_dict('records')

        final_annotations_list.append({
            "image": image_path,
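A quick illustrative check of the _merge_horizontally_adjacent_boxes helper added above, on two adjacent word boxes (values are made up; the 0.02 default threshold implies page-relative 0-1 coordinates):

import pandas as pd

words = pd.DataFrame({
    'page': [1, 1], 'line': [1, 1],
    'xmin': [0.10, 0.21], 'xmax': [0.20, 0.30],
    'ymin': [0.50, 0.50], 'ymax': [0.52, 0.52],
    'text': ['John', 'Smith'],
    'image': ['page_1.png'] * 2, 'label': ['Redaction'] * 2, 'color': ['(0, 0, 0)'] * 2,
})
merged = _merge_horizontally_adjacent_boxes(words)
# The 0.01 gap is within the 0.02 threshold, so the two boxes collapse into one
# spanning xmin=0.10 to xmax=0.30 with text 'John Smith'.
print(merged[['xmin', 'xmax', 'text']])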