seanpedrickcase committed on
Commit 87e1451 · 1 Parent(s): ee6b7fb

Added a multi-word search tool to the redaction review tool page

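The commit wires a new multi-word search box on the review page to a new entry point, run_full_search_and_analysis, in tools/find_duplicate_pages.py. The sketch below is illustrative only: it shows how that entry point could be exercised directly, assuming the tools package is importable and a word-level OCR DataFrame with the 'page', 'line' and 'word_text' columns used in the diff; the sample data and threshold are made up.

import pandas as pd
from tools.find_duplicate_pages import run_full_search_and_analysis

# Hypothetical word-level OCR output; only the 'page', 'line' and 'word_text'
# columns are required by the new search pipeline shown in this diff.
word_level_df = pd.DataFrame({
    "page": [1, 1, 1, 1],
    "line": [1, 1, 2, 2],
    "word_text": ["John", "Smith", "visited", "London"],
})

# Returns the matching word rows, any duplicate-search output files, and the
# full per-file data used for previews. The function is written for the Gradio
# app, so running it outside an app event may behave differently (e.g. the
# progress reporting).
matches_df, duplicate_files, full_data = run_full_search_and_analysis(
    search_query_text="John Smith",
    word_level_df_orig=word_level_df,
    similarity_threshold=1.0,
)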
app.py CHANGED
@@ -12,7 +12,7 @@ from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
- from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
@@ -409,8 +409,15 @@ with app:
409
  with gr.Tab("Search and redact"):
410
  with gr.Accordion("Search text", open=True):
411
  with gr.Row(equal_height=True):
412
- page_entity_dropdown_redaction = gr.Dropdown(label="Page", value="1", allow_custom_value=True)
413
- reset_dropdowns_btn_new = gr.Button(value="Reset page filter")
414
 
415
  all_page_line_level_ocr_results_with_words_df = gr.Dataframe(pd.DataFrame(data={"page":[], "line":[], "word_text":[], "word_x0":[], "word_y0":[],"word_x1":[],"word_y1":[]}), type="pandas", label="Click table row to select and go to page", headers=["page", "line", "word_text", "word_x0","word_y0","word_x1","word_y1"], show_fullscreen_button=True, wrap=False, max_height=400, show_search="filter")
416
 
@@ -472,6 +479,7 @@ with app:
472
  with gr.Row():
473
  results_df_preview = gr.Dataframe(
474
  label="Similarity Results",
 
475
  wrap=True,
476
  show_fullscreen_button=True,
477
  show_search=True,
@@ -636,7 +644,7 @@ with app:
636
  success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox])
637
 
638
  # Run redaction function
639
- document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
640
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
641
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
642
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
@@ -690,7 +698,7 @@ with app:
690
  success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_page_line_level_ocr_results_df_base, all_page_line_level_ocr_results_with_words_df_base, latest_file_completed_num, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_page_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox, all_page_line_level_ocr_results_with_words_df_base], api_name="prepare_doc").\
691
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
692
 
693
- # Manual updates to review di
694
  review_file_df.input(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
695
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
696
 
@@ -782,6 +790,32 @@ with app:
782
  ###
783
  page_entity_dropdown_redaction.select(update_redact_choice_df_from_page_dropdown, inputs=[page_entity_dropdown_redaction, all_page_line_level_ocr_results_with_words_df_base], outputs=[all_page_line_level_ocr_results_with_words_df])
784
785
  # Clicking on a cell in the redact items table will take you to that page
786
  all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
787
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
@@ -871,7 +905,7 @@ with app:
871
  ],
872
  outputs=[
873
  results_df_preview,
874
- duplicate_files_out,
875
  full_duplicate_data_by_file
876
  ]
877
  )
@@ -896,6 +930,8 @@ with app:
896
  outputs=[review_file_df, all_image_annotations_state]).\
897
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
898
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
 
 
899
 
900
  ###
901
  # SETTINGS PAGE INPUT / OUTPUT
@@ -910,6 +946,7 @@ with app:
910
  in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
911
  in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
912
 
 
913
  apply_fully_redacted_list_btn.click(
914
  fn=apply_whole_page_redactions_from_list,
915
  inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
 
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
+ from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates, run_full_search_and_analysis
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
 
409
  with gr.Tab("Search and redact"):
410
  with gr.Accordion("Search text", open=True):
411
  with gr.Row(equal_height=True):
412
+ page_entity_dropdown_redaction = gr.Dropdown(label="Page", value="1", allow_custom_value=True, scale=4)
413
+ reset_dropdowns_btn_new = gr.Button(value="Reset page filter", scale=1)
414
+
415
+ with gr.Row():
416
+ multi_word_search_text = gr.Textbox(label="Multi-word text search", value="", scale=4)
417
+ multi_word_search_text_btn = gr.Button(value="Search", scale=1)
418
+
419
+ with gr.Accordion("Search options", open=False):
420
+ similarity_search_score_minimum = gr.Number(value=1.0, minimum=0.4, maximum=1.0, label="Minimum similarity score for match (max=1)")
421
 
422
  all_page_line_level_ocr_results_with_words_df = gr.Dataframe(pd.DataFrame(data={"page":[], "line":[], "word_text":[], "word_x0":[], "word_y0":[],"word_x1":[],"word_y1":[]}), type="pandas", label="Click table row to select and go to page", headers=["page", "line", "word_text", "word_x0","word_y0","word_x1","word_y1"], show_fullscreen_button=True, wrap=False, max_height=400, show_search="filter")
423
 
 
479
  with gr.Row():
480
  results_df_preview = gr.Dataframe(
481
  label="Similarity Results",
482
+ headers=["Page1_File", "Page1_Start_Page", "Page1_End_Page", "Page2_File", "Page2_Start_Page", "Page2_End_Page", "Match_Length", "Avg_Similarity", "Page1_Text", "Page2_Text"],
483
  wrap=True,
484
  show_fullscreen_button=True,
485
  show_search=True,
 
644
  success(fn=check_for_relevant_ocr_output_with_words, inputs=[doc_file_name_no_extension_textbox, text_extract_method_radio, output_folder_textbox], outputs=[relevant_ocr_output_with_words_found_checkbox])
645
 
646
  # Run redaction function
647
+ document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number, all_page_line_level_ocr_results_with_words]).\
648
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
649
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_num, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_df, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base],
650
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_num, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_page_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_df, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words, all_page_line_level_ocr_results_with_words_df_base, backup_review_state], api_name="redact_doc")
 
698
  success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_page_line_level_ocr_results_df_base, all_page_line_level_ocr_results_with_words_df_base, latest_file_completed_num, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_page_line_level_ocr_results_df_base, relevant_ocr_output_with_words_found_checkbox, all_page_line_level_ocr_results_with_words_df_base], api_name="prepare_doc").\
699
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
700
 
701
+ # Manual updates to review df
702
  review_file_df.input(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
703
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
704
 
 
790
  ###
791
  page_entity_dropdown_redaction.select(update_redact_choice_df_from_page_dropdown, inputs=[page_entity_dropdown_redaction, all_page_line_level_ocr_results_with_words_df_base], outputs=[all_page_line_level_ocr_results_with_words_df])
792
 
793
+ multi_word_search_text.submit(
794
+ fn=run_full_search_and_analysis,
795
+ inputs=[
796
+ multi_word_search_text,
797
+ all_page_line_level_ocr_results_with_words_df_base,
798
+ similarity_search_score_minimum
799
+ ],
800
+ outputs=[
801
+ all_page_line_level_ocr_results_with_words_df,
802
+ duplicate_files_out,
803
+ full_duplicate_data_by_file
804
+ ])
805
+
806
+ multi_word_search_text_btn.click(
807
+ fn=run_full_search_and_analysis,
808
+ inputs=[
809
+ multi_word_search_text,
810
+ all_page_line_level_ocr_results_with_words_df_base,
811
+ similarity_search_score_minimum
812
+ ],
813
+ outputs=[
814
+ all_page_line_level_ocr_results_with_words_df,
815
+ duplicate_files_out,
816
+ full_duplicate_data_by_file
817
+ ])
818
+
819
  # Clicking on a cell in the redact items table will take you to that page
820
  all_page_line_level_ocr_results_with_words_df.select(df_select_callback_dataframe_row_ocr_with_words, inputs=[all_page_line_level_ocr_results_with_words_df], outputs=[selected_entity_dataframe_row_redact, selected_entity_dataframe_row_text_redact]).\
821
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row_redact, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
 
905
  ],
906
  outputs=[
907
  results_df_preview,
908
+ duplicate_files_out,
909
  full_duplicate_data_by_file
910
  ]
911
  )
 
930
  outputs=[review_file_df, all_image_annotations_state]).\
931
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
932
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_entity_dropdown_redaction, page_sizes, all_image_annotations_state])
933
+
934
+
935
 
936
  ###
937
  # SETTINGS PAGE INPUT / OUTPUT
 
946
  in_deny_list_state.input(update_dataframe, inputs=[in_deny_list_state], outputs=[in_deny_list_state])
947
  in_fully_redacted_list_state.input(update_dataframe, inputs=[in_fully_redacted_list_state], outputs=[in_fully_redacted_list_state])
948
 
949
+ # Apply whole page redactions from the provided whole page redaction csv file upload/list of specific page numbers given by user
950
  apply_fully_redacted_list_btn.click(
951
  fn=apply_whole_page_redactions_from_list,
952
  inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
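End of the app.py changes. For readers unfamiliar with the Gradio pattern used above, the sketch below (simplified, standalone names) shows the wiring idea the commit adds: the search Textbox's submit event and the Search Button's click event register the same handler with the same inputs and outputs, so pressing Enter and clicking the button behave identically.

import gradio as gr

def search_handler(query: str, min_score: float) -> str:
    # Stand-in for run_full_search_and_analysis in the real app
    return f"Searching for '{query}' with minimum similarity {min_score}"

with gr.Blocks() as demo:
    with gr.Row():
        query_box = gr.Textbox(label="Multi-word text search", scale=4)
        search_btn = gr.Button("Search", scale=1)
    min_score = gr.Number(value=1.0, minimum=0.4, maximum=1.0,
                          label="Minimum similarity score for match (max=1)")
    results = gr.Textbox(label="Results")

    # The same handler and input/output lists are registered on both events,
    # mirroring multi_word_search_text.submit(...) and multi_word_search_text_btn.click(...)
    query_box.submit(search_handler, inputs=[query_box, min_score], outputs=[results])
    search_btn.click(search_handler, inputs=[query_box, min_score], outputs=[results])

# demo.launch()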
tools/file_redaction.py CHANGED
@@ -104,7 +104,7 @@ def choose_and_run_redactor(file_paths:List[str],
104
  page_min:int=0,
105
  page_max:int=999,
106
  estimated_time_taken_state:float=0.0,
107
- handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
108
  all_request_metadata_str:str = "",
109
  annotations_all_pages:List[dict]=list(),
110
  all_page_line_level_ocr_results_df:pd.DataFrame=None,
@@ -132,7 +132,7 @@ def choose_and_run_redactor(file_paths:List[str],
132
  ocr_file_path:str="",
133
  all_page_line_level_ocr_results:list[dict] = list(),
134
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
135
- all_page_line_level_ocr_results_with_words_df:pd.DataFrame=list(),
136
  prepare_images:bool=True,
137
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
138
  progress=gr.Progress(track_tqdm=True)):
@@ -202,6 +202,11 @@ def choose_and_run_redactor(file_paths:List[str],
202
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
203
  review_out_file_paths = [prepared_pdf_file_paths[0]]
204
205
  # Create copies of out_file_path objects to avoid overwriting each other on append actions
206
  out_file_paths = out_file_paths.copy()
207
  log_files_output_paths = log_files_output_paths.copy()
@@ -663,6 +668,9 @@ def choose_and_run_redactor(file_paths:List[str],
663
  if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
664
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
665
 
 
 
 
666
  # Convert the gradio annotation boxes to relative coordinates
667
  progress(0.93, "Creating review file output")
668
  page_sizes = page_sizes_df.to_dict(orient="records")
@@ -1203,7 +1211,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1203
  # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
1204
  ###
1205
 
1206
- def merge_img_bboxes(bboxes, combined_results: Dict, page_signature_recogniser_results=[], page_handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Extract handwriting", "Extract signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
1207
 
1208
  all_bboxes = []
1209
  merged_bboxes = []
@@ -1385,6 +1393,8 @@ def redact_image_pdf(file_path:str,
1385
 
1386
  tic = time.perf_counter()
1387
 
 
 
1388
  file_name = get_file_name_without_type(file_path)
1389
  comprehend_query_number_new = 0
1390
 
 
104
  page_min:int=0,
105
  page_max:int=999,
106
  estimated_time_taken_state:float=0.0,
107
+ handwrite_signature_checkbox:List[str]=list(["Extract handwriting", "Extract signatures"]),
108
  all_request_metadata_str:str = "",
109
  annotations_all_pages:List[dict]=list(),
110
  all_page_line_level_ocr_results_df:pd.DataFrame=None,
 
132
  ocr_file_path:str="",
133
  all_page_line_level_ocr_results:list[dict] = list(),
134
  all_page_line_level_ocr_results_with_words:list[dict] = list(),
135
+ all_page_line_level_ocr_results_with_words_df:pd.DataFrame=None,
136
  prepare_images:bool=True,
137
  RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
138
  progress=gr.Progress(track_tqdm=True)):
 
202
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
203
  review_out_file_paths = [prepared_pdf_file_paths[0]]
204
 
205
+ print("all_page_line_level_ocr_results_with_words at start of choose and run...:", all_page_line_level_ocr_results_with_words)
206
+
207
+ if all_page_line_level_ocr_results_with_words_df is None:
208
+ all_page_line_level_ocr_results_with_words_df = pd.DataFrame()
209
+
210
  # Create copies of out_file_path objects to avoid overwriting each other on append actions
211
  out_file_paths = out_file_paths.copy()
212
  log_files_output_paths = log_files_output_paths.copy()
 
668
  if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
669
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
670
 
671
+ if all_page_line_level_ocr_results_with_words_df_file_path not in out_file_paths:
672
+ out_file_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
673
+
674
  # Convert the gradio annotation boxes to relative coordinates
675
  progress(0.93, "Creating review file output")
676
  page_sizes = page_sizes_df.to_dict(orient="records")
 
1211
  # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
1212
  ###
1213
 
1214
+ def merge_img_bboxes(bboxes:list, combined_results: Dict, page_signature_recogniser_results=[], page_handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Extract handwriting", "Extract signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
1215
 
1216
  all_bboxes = []
1217
  merged_bboxes = []
 
1393
 
1394
  tic = time.perf_counter()
1395
 
1396
+ print("all_page_line_level_ocr_results_with_words in redact_image_pdf:", all_page_line_level_ocr_results_with_words)
1397
+
1398
  file_name = get_file_name_without_type(file_path)
1399
  comprehend_query_number_new = 0
1400
 
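End of the tools/file_redaction.py changes. The signature change above replaces a wrongly-typed default (all_page_line_level_ocr_results_with_words_df:pd.DataFrame=list()) with None plus an in-function guard. A minimal standalone sketch of that pattern, with a hypothetical function name:

import pandas as pd
from typing import Optional

def prepare_words(words_df: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    # Hypothetical name. Defaulting to None keeps the default immutable and
    # makes "no data supplied" explicit, instead of a wrongly-typed list() or
    # a shared mutable DataFrame default.
    if words_df is None:
        words_df = pd.DataFrame()
    return words_df

print(prepare_words().empty)                                        # True
print(prepare_words(pd.DataFrame({"word_text": ["hello"]})).shape)  # (1, 1)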
tools/find_duplicate_pages.py CHANGED
@@ -1,10 +1,12 @@
1
  import pandas as pd
2
  import os
3
  import re
 
 
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
- from typing import List, Tuple, Optional, Dict
8
  from collections import defaultdict
9
  import gradio as gr
10
  from gradio import Progress
@@ -16,76 +18,400 @@ import en_core_web_lg
16
  nlp = en_core_web_lg.load()
17
 
18
  similarity_threshold = 0.95
19
 
20
- def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, output_folder:str=OUTPUT_FOLDER):
21
  """
22
- Combines text from multiple CSV files containing page and text columns.
23
- Groups text by file and page number, concatenating text within these groups.
24
 
25
- Args:
26
- input_files (list): List of paths to CSV files
27
28
  Returns:
29
- pd.DataFrame: Combined dataframe with columns [file, page, text]
 
 
 
30
  """
31
- all_data = []
32
- output_files = []
 
 
33
 
34
- if isinstance(input_files, str):
35
- file_paths_list = [input_files]
36
  else:
37
- file_paths_list = input_files
38
-
39
- for file in file_paths_list:
 
 
 
40
 
41
- if isinstance(file, str):
42
- file_path = file
43
- else:
44
- file_path = file.name
45
 
46
- # Read CSV file
47
- df = pd.read_csv(file_path)
48
-
49
- # Ensure required columns exist
50
  if 'page' not in df.columns or 'text' not in df.columns:
51
- print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
52
  continue
53
 
 
54
  df['text'] = df['text'].fillna('').astype(str)
55
-
56
- # Group by page and concatenate text
57
- if combine_pages == True:
58
- grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
59
  else:
60
- df['line_number_by_page'] = df.groupby('page').cumcount() + 1
61
- df['original_page'] = df['page']
62
- df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
63
- df['page'] = df['page'].astype(int)
64
 
65
- grouped = df #.drop('line_number_by_page', axis=1)
66
-
67
- # Add filename column
68
- grouped['file'] = os.path.basename(file_path)
69
-
70
- all_data.append(grouped)
71
-
72
  if not all_data:
73
- raise ValueError("No valid CSV files were processed")
74
-
75
- # Combine all dataframes
76
  combined_df = pd.concat(all_data, ignore_index=True)
77
 
78
- # Reorder columns
79
- combined_df = combined_df[['file', 'page', 'text']]
80
 
81
- output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
82
- combined_df.to_csv(output_combined_file_path, index=None)
83
 
84
- output_files.append(output_combined_file_path)
85
-
86
- return combined_df, output_files
87
 
88
- def process_data(df:pd.DataFrame, column:str):
89
  '''
90
  Clean and stem text columns in a data frame
91
  '''
@@ -213,142 +539,312 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str,
213
 
214
  return output_paths
215
 
216
- def identify_similar_pages(
217
  df_combined: pd.DataFrame,
218
- similarity_threshold: float = 0.9,
219
- min_word_count: int = 10,
220
  min_consecutive_pages: int = 1,
221
- greedy_match: bool = False,
222
- combine_pages:bool=True,
223
- output_folder: str = OUTPUT_FOLDER,
 
 
224
  progress=Progress(track_tqdm=True)
225
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
226
  """
227
- Identifies similar pages with three possible strategies:
228
- 1. Single Page: If greedy_match=False and min_consecutive_pages=1.
229
- 2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
230
- 3. Greedy Consecutive Match: If greedy_match=True.
231
  """
232
-
233
- output_paths = []
234
  progress(0.1, desc="Processing and filtering text")
235
- df = process_data(df_combined, 'text')
236
  df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
 
237
  original_row_count = len(df)
238
  df_filtered = df[df['word_count'] >= min_word_count].copy()
239
  df_filtered.reset_index(drop=True, inplace=True)
240
-
241
  print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
242
-
243
  if len(df_filtered) < 2:
244
  return pd.DataFrame(), [], df_combined
245
-
246
- vectorizer = TfidfVectorizer()
247
- tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
248
-
249
- progress(0.3, desc="Calculating text similarity")
250
- similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
251
- coo_matrix = similarity_matrix.tocoo()
252
-
253
- # Create a DataFrame of all individual page pairs above the threshold.
254
- # This is the base for all three matching strategies.
255
- similar_pages = [
256
- (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
257
- if r < c and v >= similarity_threshold
258
- ]
259
-
260
- if not similar_pages:
261
- return pd.DataFrame(), [], df_combined
262
-
263
- base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
264
 
265
- progress(0.6, desc="Aggregating results based on matching strategy")
266
 
267
- if greedy_match:
268
- print("Finding matches using greedy consecutive strategy.")
269
-
270
- # A set of pairs for fast lookups of (page1_idx, page2_idx)
271
- valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
272
-
273
- # Keep track of indices that have been used in a sequence
274
- consumed_indices_1 = set()
275
- consumed_indices_2 = set()
276
-
277
- all_sequences = []
278
-
279
- # Iterate through all potential starting pairs, sorted for consistent results
280
- sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
281
-
282
- for _, row in sorted_pairs.iterrows():
283
- start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
284
 
285
- # If this pair has already been consumed by a previous sequence, skip it
286
- if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
287
- continue
 
 
288
 
289
- # This is a new sequence, start expanding it
290
- current_sequence = [(start_idx1, start_idx2)]
291
- k = 1
292
- while True:
293
- next_idx1 = start_idx1 + k
294
- next_idx2 = start_idx2 + k
295
-
296
- # Check if the next pair in the sequence is a valid match
297
- if (next_idx1, next_idx2) in valid_pairs_set and \
298
- next_idx1 not in consumed_indices_1 and \
299
- next_idx2 not in consumed_indices_2:
300
- current_sequence.append((next_idx1, next_idx2))
301
- k += 1
302
- else:
303
- # The sequence has ended
304
- break
305
-
306
- # Record the found sequence and mark all its pages as consumed
307
- sequence_indices_1 = [p[0] for p in current_sequence]
308
- sequence_indices_2 = [p[1] for p in current_sequence]
309
-
310
- all_sequences.append({
311
- 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
312
- 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
313
- 'Match_Length': len(current_sequence)
314
- })
315
 
316
- consumed_indices_1.update(sequence_indices_1)
317
- consumed_indices_2.update(sequence_indices_2)
 
 
318
 
319
- if not all_sequences:
320
  return pd.DataFrame(), [], df_combined
 
 
 
 
321
 
322
- subdocument_df = pd.DataFrame(all_sequences)
 
 
 
 
323
 
324
- elif min_consecutive_pages > 1:
325
- # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
326
- print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
327
- similarity_df = base_similarity_df.copy()
328
- similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
329
  is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
 
 
 
330
  block_id = is_consecutive.eq(False).cumsum()
 
 
331
  grouped = similarity_df.groupby(block_id)
 
 
332
  agg_results = grouped.agg(
333
- Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
334
- Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
335
- Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
 
 
 
336
  ).reset_index(drop=True)
337
- subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
338
- if subdocument_df.empty: return pd.DataFrame(), [], df_combined
339
 
340
  else:
341
- # --- STRATEGY 1: Single Page Matching ---
342
- print(f"Finding single page matches (min_consecutive_pages=1)")
343
  final_df = map_metadata_single_page(base_similarity_df, df_filtered)
344
- # The rest of the logic (saving files) is handled after this if/else block
345
- pass # The final_df is already prepared
 
 
 
346
 
347
- # --- Map metadata and format output ---
348
- # This block now handles the output for both subdocument strategies (2 and 3)
349
- if greedy_match or min_consecutive_pages > 1:
350
- final_df = map_metadata_subdocument(subdocument_df, df_filtered)
351
-
352
  progress(0.8, desc="Saving output files")
353
 
354
  output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
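The matching strategies sketched in the removed identify_similar_pages code (single page, fixed-length subdocument, greedy consecutive) all start from the same core step: TF-IDF vectorisation plus sparse cosine similarity, keeping page pairs above a threshold. A self-contained illustration of that step, with made-up page texts:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pages = [
    "the quick brown fox jumps over the lazy dog",
    "the quick brown fox jumps over a lazy dog",
    "completely unrelated text about invoices and totals",
]

# Vectorise the pages and compute a sparse pairwise similarity matrix
tfidf = TfidfVectorizer().fit_transform(pages)
sim = cosine_similarity(tfidf, dense_output=False).tocoo()

# Keep only the upper-triangle pairs above the threshold, as the module does
threshold = 0.9
pairs = [(r, c, v) for r, c, v in zip(sim.row, sim.col, sim.data)
         if r < c and v >= threshold]
print(pairs)  # e.g. [(0, 1, ~0.96)] -- pages 0 and 1 are near-duplicates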
@@ -405,18 +901,16 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
405
  Wrapper function updated to include the 'greedy_match' boolean.
406
  """
407
  if not files:
408
- gr.Warning("Please upload files to analyze.")
409
- return None, None, None
410
 
411
  progress(0, desc="Combining input files...")
412
- df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)
413
 
414
  if df_combined.empty:
415
- gr.Warning("No data found in the uploaded files.")
416
- return None, None, None
417
 
418
  # Call the main analysis function with the new parameter
419
- results_df, output_paths, full_df = identify_similar_pages(
420
  df_combined=df_combined,
421
  similarity_threshold=threshold,
422
  min_word_count=min_words,
@@ -428,7 +922,6 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
428
 
429
  # Clip text to first 200 characters
430
  full_df['text'] = full_df['text'].str[:preview_length]
431
-
432
  # Preprocess full_data (without preview text) for fast access (run once)
433
  full_data_by_file = {
434
  file: df.sort_values('page').set_index('page')
@@ -438,7 +931,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
438
  if results_df.empty:
439
  gr.Info(f"No duplicate pages found, no results returned.")
440
 
441
- return results_df, output_paths, full_data_by_file # full_df,
442
 
443
  def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
444
  """
@@ -523,10 +1016,29 @@ def add_new_annotations_to_existing_page_annotations(
523
 
524
  return all_annotations, newly_added_annotation_group
525
 
526
- def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=[]):
527
  '''
528
- Take a list of suggested whole pages to redact and apply it to review file data.
529
  '''
530
  all_annotations = all_existing_annotations.copy()
531
 
532
  if not pymupdf_doc:
@@ -659,34 +1171,27 @@ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFram
659
  review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
660
  review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
661
 
662
- out_message = "Successfully created whole page redactions."
663
  print(out_message)
664
  gr.Info(out_message)
665
 
666
  return review_file_out, all_annotations
667
 
668
- # --- 1. Helper Function to Parse the Combined Page/Line ID ---
669
  def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
670
- """
671
- Parses a combined page and line number ID into a (page, line) tuple.
672
- Assumes the ID is a 10-digit number where the first 5 are the page
673
- and the last 5 are the line number.
674
-
675
- Example: 100027 -> (1, 27)
676
- 200005 -> (2, 5)
677
- """
678
- # zfill ensures the string is padded with leading zeros to 10 characters
679
- s_id = str(combined_id).zfill(10)
680
- page = int(s_id[:5])
681
- line = int(s_id[5:])
682
  return page, line
683
 
684
  def create_annotation_objects_from_duplicates(
685
  duplicates_df: pd.DataFrame,
686
  ocr_results_df: pd.DataFrame,
687
  page_sizes: List[Dict],
688
- combine_pages:bool=False
689
- ) -> List[Dict]:
690
  """
691
  Creates structured annotation objects from duplicate line ranges, mapping
692
  page numbers to image paths.
@@ -702,8 +1207,12 @@ def create_annotation_objects_from_duplicates(
702
  """
703
  final_output = []
704
 
705
  if combine_pages == False:
706
- # --- NEW: Create an efficient lookup map from page number to image path ---
707
  page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
708
 
709
  # Prepare OCR Data: Add a line number column if it doesn't exist
 
1
  import pandas as pd
2
  import os
3
  import re
4
+ import itertools # For getting file pairs
5
+ import numpy as np # For efficient index finding
6
  from tools.helper_functions import OUTPUT_FOLDER
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
+ from typing import List, Tuple, Optional, Dict, Union
10
  from collections import defaultdict
11
  import gradio as gr
12
  from gradio import Progress
 
18
  nlp = en_core_web_lg.load()
19
 
20
  similarity_threshold = 0.95
21
+ number_of_zeros_to_add_to_index = 7 # Number of zeroes to add between page number and line numbers to get a unique page/line index value
22
+ ID_MULTIPLIER = 100000
23
+
24
+ def extract_indices_from_page_ranges(
25
+ results_df: pd.DataFrame,
26
+ start_col: str = 'Page2_Start_Page',
27
+ end_col: str = 'Page2_End_Page',
28
+ modulo_divisor_number_of_zeros: int = number_of_zeros_to_add_to_index, # Number of zeros inserted between the page and line numbers when the combined index was built
29
+ converted_index: bool = False # Has the index been converted to the page_no + 0000 + line number format that needs the modulo divisor to convert back?
30
+ ) -> List[int]:
31
+ all_indices = set()
32
+ modulo_divisor = int("1" + modulo_divisor_number_of_zeros*"0")
33
+
34
+ for _, row in results_df.iterrows():
35
+ start_page = row[start_col]
36
+ end_page = row[end_col]
37
+ for encoded_page_id in range(start_page, end_page + 1):
38
+ if converted_index == True:
39
+ original_page, original_index = _parse_page_line_id(encoded_page_id)
40
+ else:
41
+ original_index = encoded_page_id
42
+
43
+ all_indices.add(original_index)
44
+ return sorted(list(all_indices))
45
 
46
+ def run_full_search_and_analysis(
47
+ search_query_text: str,
48
+ word_level_df_orig: pd.DataFrame,
49
+ similarity_threshold: float = 1,
50
+ combine_pages: bool = False,
51
+ min_word_count: int = 1,
52
+ min_consecutive_pages: int = 1,
53
+ greedy_match: bool = True,
54
+ remake_index: bool = False,
55
+ progress=gr.Progress(track_tqdm=True)
56
+ ):
57
  """
58
+ This function orchestrates the entire pipeline for finding duplicate pages based on a user's search query. It takes in the search query text, the original word-level OCR data, and various parameters to control the analysis. The function then:
59
+
60
+ 1. Converts the user's search query into a DataFrame format suitable for analysis.
61
+ 2. Prepares the main word-level OCR data for processing by converting it into the required format.
62
+ 3. Combines the search query DataFrame with the prepared OCR data DataFrame.
63
+ 4. Executes the similarity analysis on the combined data using the specified parameters such as similarity threshold, minimum word count, minimum consecutive pages, and greedy match strategy.
64
+
65
+ Parameters:
66
+ - search_query_text (str): The text entered by the user to search for in the OCR data.
67
+ - word_level_df_orig (pd.DataFrame): The original DataFrame containing word-level OCR data.
68
+ - similarity_threshold (float, optional): The minimum similarity score required for two pages to be considered duplicates. Defaults to 1.
69
+ - combine_pages (bool, optional): A flag indicating whether to combine text from the same page number within a file. Defaults to False.
70
+ - min_word_count (int, optional): The minimum number of words required for a page to be considered in the analysis. Defaults to 1.
71
+ - min_consecutive_pages (int, optional): The minimum number of consecutive pages required to be considered a match. Defaults to 1.
72
+ - greedy_match (bool, optional): A flag indicating whether to use a greedy strategy for matching consecutive pages. Defaults to True.
73
+ - remake_index (bool, optional): A flag indicating whether to remake the index of the DataFrame during processing. Defaults to False.
74
+ - progress (gr.Progress, optional): A Progress object to track the progress of the operation. Defaults to a Progress object with track_tqdm set to True.
75
+ """
76
+
77
+ if len(search_query_text) < 3:
78
+ raise Warning("Please use a search query with at least three letters.")
79
+ if len(search_query_text) > 100:
80
+ raise Warning("Please use a search query with at less than 100 characters.")
81
+
82
+ # Step 1: Process the user's search query string
83
+ search_query_data, query_word_length = create_dataframe_from_string(search_query_text, split_words=True)
84
+ if not search_query_data:
85
+ # Handle case where user submits an empty search string
86
+ raise Warning("Could not convert search string to required format")
87
+
88
+ if query_word_length > 10:
89
+ # Handle case where the search query contains too many words
90
+ raise Warning("Please use a query with less than 10 words")
91
+
92
+ # Overwrite min_consecutive_pages with the search string length
93
+ min_consecutive_pages = query_word_length
94
 
95
+ # Create word index from reference table
96
+ word_level_df_orig["index"] = word_level_df_orig.index
97
+ word_level_df = word_level_df_orig.copy()
98
+
99
+ # Step 2: Process the main word-level OCR DataFrame
100
+ word_level_data = convert_word_level_df(word_level_df, file_name="source_document")
101
+
102
+ # Step 3: Combine both data sources into one list
103
+ all_data_to_process = search_query_data + word_level_data
104
+ if not all_data_to_process:
105
+ raise gr.Error("No data to process. Please check your inputs.")
106
 
107
+ # Step 4: Run the combination logic
108
+ combined_df, _, full_out_ocr_df = combine_ocr_dataframes(
109
+ input_data=all_data_to_process,
110
+ combine_pages=combine_pages,
111
+ output_folder=None, # No need to save this intermediate file
112
+ remake_index=remake_index
113
+ )
114
+
115
+ # Step 5: Run the final similarity analysis on the combined data
116
+ results_df, duplicate_files, full_data = identify_similar_text_sequences(
117
+ df_combined=combined_df,
118
+ similarity_threshold=similarity_threshold,
119
+ min_word_count=min_word_count,
120
+ min_consecutive_pages=min_consecutive_pages,
121
+ greedy_match=greedy_match,
122
+ combine_pages=combine_pages,
123
+ inter_file_only=True,
124
+ do_text_clean=False,
125
+ progress=progress
126
+ )
127
+
128
+ print("Finished text search")
129
+
130
+ # Map the results back to the reference data file
131
+ if remake_index == True:
132
+ results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=True)
133
+ else:
134
+ results_df_index_list = extract_indices_from_page_ranges(results_df, converted_index=False)
135
+
136
+ word_level_df_out = word_level_df_orig.loc[word_level_df_orig["index"].isin(results_df_index_list)]
137
+
138
+ return word_level_df_out, duplicate_files, full_data
139
+
140
+ def create_all_data_to_process(converted_data:pd.DataFrame, other_data_list:List[Tuple]):
141
+ all_data_to_process = converted_data + other_data_list
142
+ return all_data_to_process
143
+
144
+ def convert_word_level_df(
145
+ word_level_df: pd.DataFrame,
146
+ file_name: str = "converted_dataframe"
147
+ ) -> List[Tuple[str, pd.DataFrame]]:
148
+ """
149
+ Converts a word-level OCR DataFrame to the format for
150
+ combine_ocr_dataframes.
151
+
152
+ A simple renaming and selection of relevant columns
153
+
154
+ Args:
155
+ word_level_df (pd.DataFrame):
156
+ A DataFrame containing detailed OCR output. Must include at least
157
+ the columns: 'page', 'line', and 'word_text'.
158
+ file_name (str, optional):
159
+ A unique identifier or "dummy" filename to assign to the resulting
160
+ data. Defaults to "converted_dataframe".
161
+
162
  Returns:
163
+ List[Tuple[str, pd.DataFrame]]:
164
+ A list containing a single tuple of (file_name, DataFrame), ready
165
+ to be used as input for the combine_ocr_dataframes function. The
166
+ DataFrame will have 'page' and 'text' columns.
167
  """
168
+ # --- 1. Validate Input ---
169
+ required_columns = ['page', 'line', 'word_text']
170
+ if not all(col in word_level_df.columns for col in required_columns):
171
+ raise ValueError(f"Input DataFrame must contain all of the following columns: {required_columns}")
172
 
173
+ df = word_level_df.copy()
174
+
175
+ # --- 2. Process the DataFrame ---
176
+ # Ensure word_text is a string to allow for joining
177
+ df['word_text'] = df['word_text'].astype(str)
178
+
179
+ # Group by page and line number, then join the words with a space (not needed for word level search)
180
+ # The result is a Series with a MultiIndex (page, line)
181
+ #line_text_series = df.groupby(['page', 'line'])['word_text'].apply(' '.join)
182
+
183
+ # Convert the Series back to a DataFrame and reset the index
184
+ #line_level_df = line_text_series.reset_index()
185
+
186
+ # Rename the aggregated column from 'word_text' to the required 'text'
187
+ df = df.rename(columns={'word_text': 'text'})
188
+
189
+ # --- 3. Finalise the structure ---
190
+ # We now have a DataFrame with columns [page, line, text].
191
+ final_df = df[['page', 'text']]
192
+
193
+ # --- 4. Package for output ---
194
+ # Return in the required List[Tuple[str, DataFrame]] format
195
+ return [(file_name, final_df)]
196
+
197
+ def create_dataframe_from_string(
198
+ text_string: str,
199
+ file_name: str = "user_search_query",
200
+ page_number: int = 1,
201
+ split_words: bool = False
202
+ ) -> Tuple[List[Tuple[str, pd.DataFrame]], int]:
203
+ """
204
+ Converts a string into a DataFrame compatible with combine_ocr_dataframes.
205
+
206
+ Can operate in two modes:
207
+ 1. As a single-line document (default).
208
+ 2. As a multi-line document where each word from the string is a separate line.
209
+
210
+ Args:
211
+ text_string (str): The input text to be placed in the DataFrame.
212
+ file_name (str, optional): A dummy filename to assign to this text.
213
+ Defaults to "user_search_query".
214
+ page_number (int, optional): A dummy page number to assign. Defaults to 1.
215
+ split_words (bool, optional): If True, splits the input string by
216
+ whitespace and creates a row for each word.
217
+ If False (default), the entire string is
218
+ treated as a single text entry.
219
+
220
+ Returns:
221
+ Tuple[List[Tuple[str, pd.DataFrame]], int]:
222
+ A list containing a single tuple: (file_name, DataFrame).
223
+ The DataFrame has 'page' and 'text' columns. Also, an integer value indicating the number of words in the search string.
224
+ Returns an empty list if the input string is empty or whitespace.
225
+ """
226
+ # Handle empty input gracefully, this works for both modes.
227
+ if not text_string or not text_string.strip():
228
+ print("Warning: Input string is empty. Returning an empty list.")
229
+ return [], 0
230
+
231
+ if split_words:
232
+ # --- MODE 2: Split string into words, one per row ---
233
+ words = text_string.split()
234
+ len_words = len(words)
235
+ data = {
236
+ # Assign the same page number to every word
237
+ 'page': [page_number] * len(words),
238
+ # The list of words becomes the text column
239
+ 'text': words
240
+ }
241
  else:
242
+ # --- MODE 1: Original behavior, entire string in one row ---
243
+ len_words = 1
244
+ data = {
245
+ 'page': [page_number],
246
+ 'text': [text_string]
247
+ }
248
 
249
+ # Create the DataFrame from the prepared data
250
+ df = pd.DataFrame(data)
 
 
251
 
252
+ df["line"] = df.index + 1
253
+
254
+ # Return it in the required format: a list containing one (name, df) tuple
255
+ return [(file_name, df)], len_words
256
+
257
+ def combine_ocr_dataframes(
258
+ input_data: List[Tuple[str, pd.DataFrame]],
259
+ combine_pages: bool = True,
260
+ output_folder: str = OUTPUT_FOLDER,
261
+ output_filename: str = "combined_ocr_output.csv",
262
+ number_of_added_zeros: int = number_of_zeros_to_add_to_index,
263
+ remake_index:bool = True
264
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
265
+ """
266
+ Combines text from multiple pandas DataFrames containing page and text columns.
267
+
268
+ This function takes a list of (name, DataFrame) tuples, processes each DataFrame
269
+ by grouping and concatenating text, and then combines them into a single DataFrame.
270
+
271
+ Args:
272
+ input_data (List[Tuple[str, pd.DataFrame]]):
273
+ A list of tuples, where each tuple contains a unique identifier (like a filename)
274
+ and a pandas DataFrame. Each DataFrame must have 'page' and 'text' columns.
275
+ combine_pages (bool, optional):
276
+ If True, text from the same page number within a file is joined into a
277
+ single row. If False, each line of text gets its own row with a unique
278
+ page identifier. Defaults to True.
279
+ output_folder (str, optional):
280
+ The folder where the combined CSV file will be saved. Defaults to OUTPUT_FOLDER.
281
+ output_filename (str, optional):
282
+ The name of the output CSV file. Defaults to "combined_ocr_output.csv".
283
+
284
+ Returns:
285
+ Tuple[pd.DataFrame, List[str]]:
286
+ A tuple containing:
287
+ - The final combined and processed DataFrame.
288
+ - A list containing the path to the saved output CSV file.
289
+ """
290
+ all_data = []
291
+
292
+ for file_identifier, df_initial in input_data:
293
+ df = df_initial.copy() # Work on a copy to avoid side effects
294
+
295
+ # --- Validation ---
296
  if 'page' not in df.columns or 'text' not in df.columns:
297
+ print(f"Warning: Skipping data for '{file_identifier}' - missing required columns 'page' and 'text'.")
298
  continue
299
 
300
+ # --- Processing ---
301
  df['text'] = df['text'].fillna('').astype(str)
302
+
303
+ if combine_pages:
304
+ # Group by page and concatenate text into a single string
305
+ processed_df = df.groupby('page')['text'].apply(' '.join).reset_index()
306
  else:
307
+ if remake_index == True:
308
+ # # Create a unique, sortable page ID for each line without combining
309
+ # df['line_number_by_page'] = df.groupby('page').cumcount() + 1
310
+ # df['original_page'] = df['page']
311
+ # # Create a new page ID that combines page and line number for uniqueness
312
+ # df['page'] = (
313
+ # df['page'].astype(str).str.zfill(number_of_added_zeros) +
314
+ # df['line_number_by_page'].astype(str).str.zfill(number_of_added_zeros)
315
+ # ).astype(int)
316
+
317
+ # Define the multiplier based on the max expected lines per page.
318
+ # If you expect up to 99,999 lines, use 100,000.
319
+
320
+ df['line_number_by_page'] = df.groupby('page').cumcount() + 1
321
+ df['original_page'] = df['page']
322
+
323
+ # Create the new combined ID using arithmetic
324
+ df['page'] = (df['original_page'] * ID_MULTIPLIER) + df['line_number_by_page']
325
+
326
+ else:
327
+ if not 'index' in df.columns:
328
+ df['index'] = df.index
329
+ df['page'] = df['index']
330
+
331
+ processed_df = df
332
+
333
+ # Add the file identifier column
334
+ processed_df['file'] = file_identifier
335
+ all_data.append(processed_df)
336
 
337
  if not all_data:
338
+ raise ValueError("No valid DataFrames were processed. Ensure input data is not empty and DataFrames have 'page' and 'text' columns.")
339
+
340
+ # --- Final Combination ---
341
  combined_df = pd.concat(all_data, ignore_index=True)
342
+
343
+ # Reorder columns to a standard format, dropping intermediate columns
344
+ final_columns = ['file', 'page', 'text']
345
+ if 'original_page' in combined_df.columns:
346
+ final_columns.append('original_page') # Keep for context if created
347
 
348
+ # Ensure all final columns exist before trying to select them
349
+ existing_final_columns = [col for col in final_columns if col in combined_df.columns]
350
 
351
+ full_out_ocr_df = combined_df  # Keep a reference to the full DataFrame before narrowing the column set
352
+ combined_df = combined_df.copy()[existing_final_columns]
353
 
354
+ # --- Save Output ---
355
+ output_files = []
356
+ if output_folder and output_filename:
357
+ os.makedirs(output_folder, exist_ok=True)
358
+ output_path = os.path.join(output_folder, output_filename)
359
+ combined_df.to_csv(output_path, index=False)
360
+ output_files.append(output_path)
361
+ print(f"Successfully combined data and saved to: {output_path}")
362
+
363
+ return combined_df, output_files, full_out_ocr_df
364
+
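# Illustrative usage sketch (not part of this commit) for combine_ocr_dataframes as defined
# above. It assumes the module-level ID_MULTIPLIER constant referenced in the function body;
# the file names and text values below are made up.
import pandas as pd

ocr_a = pd.DataFrame({"page": [1, 1, 2], "text": ["Name: Jane Doe", "DOB: 01/01/1990", "Second page text"]})
ocr_b = pd.DataFrame({"page": [1], "text": ["Name: Jane Doe"]})

combined_df, output_files, full_out_ocr_df = combine_ocr_dataframes(
    input_data=[("file_a_ocr.csv", ocr_a), ("file_b_ocr.csv", ocr_b)],
    combine_pages=False,  # keep one row per line rather than joining text per page
    output_folder="output/",
    output_filename="combined_ocr_example.csv",
)
# With combine_pages=False and the default remake_index, each row gets a combined
# page/line identifier and an 'original_page' column is retained for context.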
365
+ def combine_ocr_output_text(
366
+ input_files: Union[str, List[str]],
367
+ combine_pages: bool = True,
368
+ remake_index: bool = True,
369
+ output_folder: str = OUTPUT_FOLDER
370
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
371
+ """
372
+ Reads multiple OCR CSV files, combines them, and saves the result.
373
+
374
+ This function serves as a wrapper that reads CSV files from paths and then
375
+ uses the `combine_ocr_dataframes` function to perform the combination logic.
376
+
377
+ Args:
378
+ input_files (Union[str, List[str]]): A single file path or a list of file paths.
379
+ combine_pages (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
+ remake_index (bool, optional): See `combine_ocr_dataframes`. Defaults to True.
380
+ output_folder (str, optional): See `combine_ocr_dataframes`. Defaults to OUTPUT_FOLDER.
381
+
382
+ Returns:
383
+ Tuple[pd.DataFrame, List[str], pd.DataFrame]: The combined DataFrame, the path to the output file, and the full pre-selection DataFrame.
384
+ """
385
+ if isinstance(input_files, str):
386
+ file_paths_list = [input_files]
387
+ else:
388
+ file_paths_list = input_files
389
+
390
+ data_to_process = []
391
+ for file_path in file_paths_list:
392
+ try:
393
+ df = pd.read_csv(file_path)
394
+ # Use the base filename as the identifier
395
+ file_identifier = os.path.basename(file_path)
396
+ data_to_process.append((file_identifier, df))
397
+ except FileNotFoundError:
398
+ print(f"Warning: File not found, skipping: {file_path}")
399
+ except Exception as e:
400
+ print(f"Warning: Failed to read or process {file_path}. Error: {e}")
401
 
402
+ if not data_to_process:
403
+ raise ValueError("No valid CSV files could be read or processed.")
404
+
405
+ # Call the core function with the loaded data
406
+ return combine_ocr_dataframes(
407
+ input_data=data_to_process,
408
+ combine_pages=combine_pages,
409
+ output_folder=output_folder,
410
+ output_filename="combined_ocr_from_files.csv", # Specific name for this path
411
+ remake_index=remake_index
412
+ )
413
+
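# Illustrative sketch (not part of this commit) of the file-path wrapper above. The CSV paths
# are hypothetical; each file must contain 'page' and 'text' columns.
combined_df, output_files, full_out_ocr_df = combine_ocr_output_text(
    input_files=["output/file_a_ocr_output.csv", "output/file_b_ocr_output.csv"],
    combine_pages=True,   # join text per page before comparison
    remake_index=True,
    output_folder="output/",
)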
414
+ def clean_and_stem_text_series(df:pd.DataFrame, column:str):
415
  '''
416
  Clean and stem text columns in a data frame
417
  '''
 
539
 
540
  return output_paths
541
 
542
+ # def identify_similar_text_sequences(
543
+ # df_combined: pd.DataFrame,
544
+ # similarity_threshold: float = 0.9,
545
+ # min_word_count: int = 10,
546
+ # min_consecutive_pages: int = 1,
547
+ # greedy_match: bool = False,
548
+ # combine_pages:bool=True,
549
+ # output_folder: str = OUTPUT_FOLDER,
550
+ # progress=Progress(track_tqdm=True)
551
+ # ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
552
+ # """
553
+ # Identifies similar pages with three possible strategies:
554
+ # 1. Single Page: If greedy_match=False and min_consecutive_pages=1.
555
+ # 2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
556
+ # 3. Greedy Consecutive Match: If greedy_match=True.
557
+ # """
558
+
559
+ # output_paths = []
560
+ # progress(0.1, desc="Processing and filtering text")
561
+ # df = clean_and_stem_text_series(df_combined, 'text')
562
+ # df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
563
+ # original_row_count = len(df)
564
+ # df_filtered = df[df['word_count'] >= min_word_count].copy()
565
+ # df_filtered.reset_index(drop=True, inplace=True)
566
+
567
+ # print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
568
+
569
+ # if len(df_filtered) < 2:
570
+ # return pd.DataFrame(), [], df_combined
571
+
572
+ # vectorizer = TfidfVectorizer()
573
+ # tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
574
+
575
+ # progress(0.3, desc="Calculating text similarity")
576
+ # similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
577
+ # coo_matrix = similarity_matrix.tocoo()
578
+
579
+ # # Create a DataFrame of all individual page pairs above the threshold.
580
+ # # This is the base for all three matching strategies.
581
+ # similar_pages = [
582
+ # (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
583
+ # if r < c and v >= similarity_threshold
584
+ # ]
585
+
586
+ # if not similar_pages:
587
+ # return pd.DataFrame(), [], df_combined
588
+
589
+ # base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
590
+
591
+ # progress(0.6, desc="Aggregating results based on matching strategy")
592
+
593
+ # if greedy_match:
594
+ # print("Finding matches using greedy consecutive strategy.")
595
+
596
+ # # A set of pairs for fast lookups of (page1_idx, page2_idx)
597
+ # valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
598
+
599
+ # # Keep track of indices that have been used in a sequence
600
+ # consumed_indices_1 = set()
601
+ # consumed_indices_2 = set()
602
+
603
+ # all_sequences = []
604
+
605
+ # # Iterate through all potential starting pairs, sorted for consistent results
606
+ # sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
607
+
608
+ # for _, row in sorted_pairs.iterrows():
609
+ # start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
610
+
611
+ # # If this pair has already been consumed by a previous sequence, skip it
612
+ # if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
613
+ # continue
614
+
615
+ # # This is a new sequence, start expanding it
616
+ # current_sequence = [(start_idx1, start_idx2)]
617
+ # k = 1
618
+ # while True:
619
+ # next_idx1 = start_idx1 + k
620
+ # next_idx2 = start_idx2 + k
621
+
622
+ # # Check if the next pair in the sequence is a valid match
623
+ # if (next_idx1, next_idx2) in valid_pairs_set and \
624
+ # next_idx1 not in consumed_indices_1 and \
625
+ # next_idx2 not in consumed_indices_2:
626
+ # current_sequence.append((next_idx1, next_idx2))
627
+ # k += 1
628
+ # else:
629
+ # # The sequence has ended
630
+ # break
631
+
632
+ # # Record the found sequence and mark all its pages as consumed
633
+ # sequence_indices_1 = [p[0] for p in current_sequence]
634
+ # sequence_indices_2 = [p[1] for p in current_sequence]
635
+
636
+ # all_sequences.append({
637
+ # 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
638
+ # 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
639
+ # 'Match_Length': len(current_sequence)
640
+ # })
641
+
642
+ # consumed_indices_1.update(sequence_indices_1)
643
+ # consumed_indices_2.update(sequence_indices_2)
644
+
645
+ # if not all_sequences:
646
+ # return pd.DataFrame(), [], df_combined
647
+
648
+ # subdocument_df = pd.DataFrame(all_sequences)
649
+
650
+ # elif min_consecutive_pages > 1:
651
+ # # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
652
+ # print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
653
+ # similarity_df = base_similarity_df.copy()
654
+ # similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
655
+ # is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
656
+ # block_id = is_consecutive.eq(False).cumsum()
657
+ # grouped = similarity_df.groupby(block_id)
658
+ # agg_results = grouped.agg(
659
+ # Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
660
+ # Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
661
+ # Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
662
+ # ).reset_index(drop=True)
663
+ # subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
664
+ # if subdocument_df.empty: return pd.DataFrame(), [], df_combined
665
+
666
+ # else:
667
+ # # --- STRATEGY 1: Single Page Matching ---
668
+ # print(f"Finding single page matches (min_consecutive_pages=1)")
669
+ # final_df = map_metadata_single_page(base_similarity_df, df_filtered)
670
+ # # The rest of the logic (saving files) is handled after this if/else block
671
+ # pass # The final_df is already prepared
672
+
673
+ # # --- Map metadata and format output ---
674
+ # # This block now handles the output for both subdocument strategies (2 and 3)
675
+ # if greedy_match or min_consecutive_pages > 1:
676
+ # final_df = map_metadata_subdocument(subdocument_df, df_filtered)
677
+
678
+ # progress(0.8, desc="Saving output files")
679
+
680
+ # output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
681
+
682
+ # return final_df, output_paths, df_combined
683
+
684
+ def _calculate_inter_file_similarity(df_filtered:pd.DataFrame, vectorizer, similarity_threshold:float, progress:gr.Progress):
685
+ """
686
+ Helper function to efficiently calculate similarity ONLY between different files.
687
+ """
688
+ print("Calculating inter-file similarity.")
689
+
690
+ # Step 1: Fit the vectorizer on the ENTIRE corpus to create a shared vocabulary.
691
+ # This is crucial for comparing vectors from different files meaningfully.
692
+ progress(0.2, desc="Building vocabulary...")
693
+ vectorizer.fit(df_filtered['text_clean'])
694
+
695
+ # Step 2: Group the DataFrame by file.
696
+ file_groups = df_filtered.groupby('file')
697
+ unique_files = list(file_groups.groups.keys())
698
+ all_similar_pairs = []
699
+
700
+ # Step 3: Iterate through all unique pairs of files.
701
+ file_combinations = list(itertools.combinations(unique_files, 2))
702
+
703
+ for i, (file1_name, file2_name) in enumerate(progress.tqdm(file_combinations, desc="Comparing file pairs")):
704
+ group1_df = file_groups.get_group(file1_name)
705
+ group2_df = file_groups.get_group(file2_name)
706
+
707
+ # Step 4: Use the pre-fitted vectorizer to TRANSFORM (not fit_transform) the text of each group.
708
+ tfidf1 = vectorizer.transform(group1_df['text_clean'])
709
+ tfidf2 = vectorizer.transform(group2_df['text_clean'])
710
+
711
+ # Step 5: Calculate similarity between the two groups.
712
+ # This is a much smaller matrix (pages_in_file1 x pages_in_file2).
713
+ similarity_matrix_subset = cosine_similarity(tfidf1, tfidf2)
714
+
715
+ # Step 6: Find pairs in this sub-matrix that exceed the threshold.
716
+ # `np.where` is highly efficient for this.
717
+ page1_indices, page2_indices = np.where(similarity_matrix_subset >= similarity_threshold)
718
+
719
+ # Step 7: Map the local indices back to the original global indices from `df_filtered`.
720
+ for p1_local_idx, p2_local_idx in zip(page1_indices, page2_indices):
721
+ global_idx1 = group1_df.index[p1_local_idx]
722
+ global_idx2 = group2_df.index[p2_local_idx]
723
+ score = similarity_matrix_subset[p1_local_idx, p2_local_idx]
724
+
725
+ # Ensure r < c convention to match the original method
726
+ r, c = min(global_idx1, global_idx2), max(global_idx1, global_idx2)
727
+ all_similar_pairs.append((r, c, score))
728
+
729
+ if not all_similar_pairs:
730
+ return pd.DataFrame()
731
+
732
+ # Create the final DataFrame, which is now pre-filtered.
733
+ return pd.DataFrame(all_similar_pairs, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
734
+
735
+
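# Illustrative sketch (not part of this commit) of the shared-vocabulary pattern the helper
# above relies on: fit the TF-IDF vectorizer once on the full corpus, then transform each
# file's rows separately and score similarity only across files. The data is made up.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

corpus = pd.DataFrame({
    "file": ["a.csv", "a.csv", "b.csv"],
    "text_clean": ["name jane doe", "date of birth 1990", "name jane doe"],
})
vec = TfidfVectorizer().fit(corpus["text_clean"])   # shared vocabulary across all files
tfidf_a = vec.transform(corpus.loc[corpus["file"] == "a.csv", "text_clean"])
tfidf_b = vec.transform(corpus.loc[corpus["file"] == "b.csv", "text_clean"])
scores = cosine_similarity(tfidf_a, tfidf_b)        # shape: rows_in_a x rows_in_b
rows, cols = np.where(scores >= 0.9)                # here the identical line (0, 0) is returned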
736
+ def identify_similar_text_sequences(
737
  df_combined: pd.DataFrame,
738
+ similarity_threshold: float = 1,
739
+ min_word_count: int = 1,
740
  min_consecutive_pages: int = 1,
741
+ greedy_match: bool = True,
742
+ combine_pages: bool = False,
743
+ inter_file_only: bool = False,
744
+ do_text_clean:bool = True,
745
+ output_folder: str = "output/",
746
  progress=Progress(track_tqdm=True)
747
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
748
  """
749
+ Identifies similar pages or text lines using TF-IDF and cosine similarity. Supports three
+ matching strategies: single-row matches, fixed-length consecutive matches
+ (min_consecutive_pages > 1), and greedy consecutive matching (greedy_match=True).
+ Uses a highly optimised inter-file path when inter_file_only=True, comparing rows
+ only between different files.
 
 
 
750
  """
 
 
751
  progress(0.1, desc="Processing and filtering text")
752
+
753
+ if do_text_clean:
754
+ df = clean_and_stem_text_series(df_combined, 'text') # Will produce the column 'text_clean'
755
+ else:
756
+ df = df_combined.copy()
757
+ df['text_clean'] = df['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)
758
+
759
  df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
760
+
761
  original_row_count = len(df)
762
  df_filtered = df[df['word_count'] >= min_word_count].copy()
763
  df_filtered.reset_index(drop=True, inplace=True)
 
764
  print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
 
765
  if len(df_filtered) < 2:
766
  return pd.DataFrame(), [], df_combined
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
767
 
768
+ vectorizer = TfidfVectorizer()
769
 
770
+ # Similarity is calculated differently when comparing between files only (inter_file_only=True) versus across all rows, including within the same file
771
+ if inter_file_only:
772
+ # Use the new, highly efficient helper function.
773
+ base_similarity_df = _calculate_inter_file_similarity(df_filtered, vectorizer, similarity_threshold, progress)
774
+ if base_similarity_df.empty:
775
+ return pd.DataFrame(), [], df_combined
 
 
 
 
 
 
 
 
 
 
 
776
 
777
+ else:
778
+ # Use the original, simpler path for all-to-all comparisons (including intra-file).
779
+ print("Standard Path: Calculating all-to-all similarity.")
780
+ progress(0.2, desc="Vectorizing text...")
781
+ tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
782
 
783
+ progress(0.3, desc="Calculating similarity matrix...")
784
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
785
+ coo_matrix = similarity_matrix.tocoo()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
 
787
+ similar_pages = [
788
+ (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
789
+ if r < c and v >= similarity_threshold
790
+ ]
791
 
792
+ if not similar_pages:
793
  return pd.DataFrame(), [], df_combined
794
+
795
+ base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
796
+
797
+ progress(0.6, desc="Aggregating results based on matching strategy")
798
 
799
+ if greedy_match or min_consecutive_pages > 1:
800
+ print("Finding all consecutive page matches of minimum length:", min_consecutive_pages)
801
+
802
+ # Consecutive matches are detected below; the similarity pairs are assumed to already be ordered by (Page1_Index, Page2_Index)
803
+ similarity_df = base_similarity_df #.sort_values(['Page1_Index', 'Page2_Index']).copy()
804
 
805
+ # A new sequence starts if the difference from the previous row is not (1, 1)
806
+ # is_consecutive will be True if a row continues the sequence, False if it's a new one.
 
 
 
807
  is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
808
+
809
+ # Use cumsum() on the inverted boolean series to create a unique ID for each block.
810
+ # Every time a 'False' appears (a new block starts), the sum increases.
811
  block_id = is_consecutive.eq(False).cumsum()
812
+
813
+ # Group by this block ID
814
  grouped = similarity_df.groupby(block_id)
815
+
816
+ # Aggregate each group to get the start, end, and length of the match
817
  agg_results = grouped.agg(
818
+ Page1_Start_Index=('Page1_Index', 'first'),
819
+ Page2_Start_Index=('Page2_Index', 'first'),
820
+ Page1_End_Index=('Page1_Index', 'last'),
821
+ Page2_End_Index=('Page2_Index', 'last'),
822
+ Match_Length=('Page1_Index', 'size'),
823
+ Avg_Similarity=('Similarity_Score', 'mean')
824
  ).reset_index(drop=True)
 
 
825
 
826
+ # If greedy_match=True, we keep all matches. If min_consecutive_pages > 1, we filter.
827
+ if greedy_match and min_consecutive_pages <= 1:
828
+ subdocument_df = agg_results
829
+ else:
830
+ # This handles the case for min_consecutive_pages > 1
831
+ subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
832
+
833
+ if subdocument_df.empty:
834
+ gr.Info("No matches found")
835
+ return pd.DataFrame(), [], df_combined
836
+
837
+ final_df = map_metadata_subdocument(subdocument_df, df_filtered)
838
  else:
839
+ print(f"Finding single page matches, not greedy (min_consecutive_pages=1)")
840
+ # Handle the non-sequential (single-row match) case
841
  final_df = map_metadata_single_page(base_similarity_df, df_filtered)
842
+ subdocument_df = final_df # To align variable names for saving
843
+
844
+ if subdocument_df.empty:
845
+ gr.Info("No matches found")
846
+ return pd.DataFrame(), [], df_combined
847
 
 
 
 
 
 
848
  progress(0.8, desc="Saving output files")
849
 
850
  output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
 
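# Illustrative sketch (not part of this commit) of the diff()/cumsum() grouping used above to
# turn individual matched pairs into consecutive runs. The pair indices are made up.
import pandas as pd

pairs = pd.DataFrame({
    "Page1_Index": [3, 4, 5, 10],
    "Page2_Index": [20, 21, 22, 40],
    "Similarity_Score": [0.95, 0.97, 0.96, 0.99],
})
is_consecutive = (pairs["Page1_Index"].diff() == 1) & (pairs["Page2_Index"].diff() == 1)
block_id = is_consecutive.eq(False).cumsum()  # -> [1, 1, 1, 2]: one run of three, then a new run
blocks = pairs.groupby(block_id).agg(
    Page1_Start_Index=("Page1_Index", "first"),
    Page1_End_Index=("Page1_Index", "last"),
    Match_Length=("Page1_Index", "size"),
    Avg_Similarity=("Similarity_Score", "mean"),
).reset_index(drop=True)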
901
  Wrapper function updated to include the 'greedy_match' boolean.
902
  """
903
  if not files:
904
+ raise Warning("Please upload files to analyse.")
 
905
 
906
  progress(0, desc="Combining input files...")
907
+ df_combined, _, full_out_ocr_df = combine_ocr_output_text(files, combine_pages=combine_pages)
908
 
909
  if df_combined.empty:
910
+ raise Warning("No data found in the uploaded files.")
 
911
 
912
  # Call the main analysis function with the new parameter
913
+ results_df, output_paths, full_df = identify_similar_text_sequences(
914
  df_combined=df_combined,
915
  similarity_threshold=threshold,
916
  min_word_count=min_words,
 
922
 
923
  # Clip text to the first preview_length characters
924
  full_df['text'] = full_df['text'].str[:preview_length]
 
925
  # Preprocess full_data (without preview text) for fast access (run once)
926
  full_data_by_file = {
927
  file: df.sort_values('page').set_index('page')
 
931
  if results_df.empty:
932
  gr.Info(f"No duplicate pages found, no results returned.")
933
 
934
+ return results_df, output_paths, full_data_by_file
935
 
936
  def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: gr.SelectData, preview_length:int=500):
937
  """
 
1016
 
1017
  return all_annotations, newly_added_annotation_group
1018
 
1019
+ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=list()):
1020
  '''
1021
+ This function applies redactions to whole pages based on a provided list of duplicate page numbers. It supports two modes of operation: combining pages and not combining pages. When combining pages is enabled, it attempts to identify duplicate pages across different files and applies redactions accordingly. If combining pages is disabled, it relies on new annotations with bounding boxes to determine which pages to redact. The function utilises a PyMuPDF document object to manipulate the PDF file, and it also considers the sizes of pages to ensure accurate redaction application.
1022
+
1023
+ Args:
1024
+ duplicate_page_numbers_df (pd.DataFrame): A DataFrame containing page numbers identified as duplicates.
1025
+ doc_file_name_with_extension_textbox (str): The name of the document file with its extension.
1026
+ review_file_state (pd.DataFrame): The current state of the review file.
1027
+ duplicate_output_paths (list[str]): A list of paths to files containing duplicate page information.
1028
+ pymupdf_doc (object): A PyMuPDF document object representing the PDF file.
1029
+ page_sizes (list[dict]): A list of dictionaries containing page size information.
1030
+ all_existing_annotations (list[dict]): A list of all existing annotations in the document.
1031
+ combine_pages (bool, optional): A flag indicating whether to combine pages for redaction. Defaults to True.
1032
+ new_annotations_with_bounding_boxes (List[dict], optional): A list of new annotations with bounding boxes. Defaults to an empty list.
1033
  '''
1034
+ if all_existing_annotations is None:
1035
+ all_existing_annotations = []
1036
+
1037
+ if new_annotations_with_bounding_boxes is None:
1038
+ new_annotations_with_bounding_boxes = []
1039
+
1040
+ print("new_annotations_with_bounding_boxes:", new_annotations_with_bounding_boxes)
1041
+
1042
  all_annotations = all_existing_annotations.copy()
1043
 
1044
  if not pymupdf_doc:
 
1171
  review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
1172
  review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
1173
 
1174
+ out_message = "Successfully created duplicate text redactions."
1175
  print(out_message)
1176
  gr.Info(out_message)
1177
 
1178
  return review_file_out, all_annotations
1179
 
 
1180
  def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
1181
+ """Parses a combined ID using modular arithmetic."""
1182
+ if int(combined_id) < ID_MULTIPLIER:
1183
+ # IDs smaller than ID_MULTIPLIER carry no encoded page component, so treat the whole value as the line on page 0
1184
+ return 0, combined_id
1185
+
1186
+ page = combined_id // ID_MULTIPLIER
1187
+ line = combined_id % ID_MULTIPLIER
 
 
 
 
 
1188
  return page, line
1189
 
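# Illustrative sketch (not part of this commit) of the combined page/line ID round trip used by
# combine_ocr_dataframes and _parse_page_line_id above, assuming ID_MULTIPLIER is 100000.
ID_MULTIPLIER_EXAMPLE = 100000
page, line = 12, 7
combined_id = page * ID_MULTIPLIER_EXAMPLE + line     # -> 1200007
decoded_page = combined_id // ID_MULTIPLIER_EXAMPLE   # -> 12
decoded_line = combined_id % ID_MULTIPLIER_EXAMPLE    # -> 7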
1190
  def create_annotation_objects_from_duplicates(
1191
  duplicates_df: pd.DataFrame,
1192
  ocr_results_df: pd.DataFrame,
1193
  page_sizes: List[Dict],
1194
+ combine_pages:bool=False) -> List[Dict]:
 
1195
  """
1196
  Creates structured annotation objects from duplicate line ranges, mapping
1197
  page numbers to image paths.
 
1207
  """
1208
  final_output = []
1209
 
1210
+ if duplicates_df.empty:
1211
+ raise Warning("No duplicates found")
1212
+ if ocr_results_df.empty:
1213
+ raise Warning("No OCR results found for file under review. Please upload relevant OCR_output file for the PDF file on the review tab.")
1214
+
1215
  if combine_pages == False:
 
1216
  page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
1217
 
1218
  # Prepare OCR Data: Add a line number column if it doesn't exist
tools/helper_functions.py CHANGED
@@ -22,7 +22,7 @@ def reset_state_vars():
22
  show_share_button=False,
23
  show_remove_button=False,
24
  interactive=False
25
- ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0
26
 
27
  def reset_ocr_results_state():
28
  return pd.DataFrame(), pd.DataFrame(), []
@@ -573,7 +573,10 @@ def reset_base_dataframe(df:pd.DataFrame):
573
  return df
574
 
575
  def reset_ocr_base_dataframe(df:pd.DataFrame):
576
- return df.iloc[:, [0,1]]
 
 
 
577
 
578
  def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_redaction_value:str):
579
 
 
22
  show_share_button=False,
23
  show_remove_button=False,
24
  interactive=False
25
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0, []
26
 
27
  def reset_ocr_results_state():
28
  return pd.DataFrame(), pd.DataFrame(), []
 
573
  return df
574
 
575
  def reset_ocr_base_dataframe(df:pd.DataFrame):
576
+ if df.empty:
577
+ return pd.DataFrame()
578
+ else:
579
+ return df.loc[:, ["page", "text"]]
580
 
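# Illustrative sketch (not part of this commit): the rewritten helper selects the OCR columns
# by name rather than position, and returns an empty frame when given one.
import pandas as pd

ocr = pd.DataFrame({"page": [1, 2], "text": ["hello", "world"], "left": [0.1, 0.2]})
reset_ocr_base_dataframe(ocr)             # -> just the 'page' and 'text' columns
reset_ocr_base_dataframe(pd.DataFrame())  # -> empty DataFrame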
581
  def reset_ocr_with_words_base_dataframe(df:pd.DataFrame, page_entity_dropdown_redaction_value:str):
582
 
tools/redaction_review.py CHANGED
@@ -404,6 +404,68 @@ def _generate_unique_ids(
404
 
405
  return list(newly_generated_ids)
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
  def create_annotation_objects_from_filtered_ocr_results_with_words(
408
  filtered_ocr_results_with_words_df: pd.DataFrame,
409
  ocr_results_with_words_df_base: pd.DataFrame,
@@ -411,54 +473,45 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
411
  existing_annotations_df: pd.DataFrame,
412
  existing_annotations_list: List[Dict],
413
  existing_recogniser_entity_df: pd.DataFrame,
414
- progress=gr.Progress(track_tqdm=True)
415
  ) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
416
  """
417
- Creates and merges new annotations, using custom ID logic.
 
 
 
 
 
 
 
 
 
 
 
 
418
  """
419
 
 
420
  print("Identifying new redactions to add")
421
- progress(0.1, "Identifying new redactions to add")
422
  if filtered_ocr_results_with_words_df.empty:
423
  print("No new annotations to add.")
424
  updated_annotations_df = existing_annotations_df.copy()
425
  else:
426
- # join_keys = ['page', 'line', 'word_text', 'word_x0']
427
- # new_annotations_df = pd.merge(
428
- # ocr_results_with_words_df_base,
429
- # filtered_ocr_results_with_words_df[join_keys],
430
- # on=join_keys,
431
- # how='inner'
432
- # )
433
-
434
  filtered_ocr_results_with_words_df.index = filtered_ocr_results_with_words_df["index"]
435
-
436
  new_annotations_df = ocr_results_with_words_df_base.loc[filtered_ocr_results_with_words_df.index].copy()
437
 
438
  if new_annotations_df.empty:
439
  print("No new annotations to add.")
440
  updated_annotations_df = existing_annotations_df.copy()
441
  else:
442
- # --- Custom ID Generation ---
443
- progress(0.2, "Creating new redaction IDs")
444
- # 1. Get all IDs that already exist to ensure we don't create duplicates.
445
- existing_ids = set()
446
- if 'id' in existing_annotations_df.columns:
447
- existing_ids = set(existing_annotations_df['id'].dropna())
448
-
449
- # 2. Generate the exact number of new, unique IDs required.
450
- num_new_ids = len(new_annotations_df)
451
- new_id_list = _generate_unique_ids(num_new_ids, existing_ids)
452
-
453
- # 3. Assign the new IDs and other columns in a vectorized way.
454
  page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
455
 
456
- progress(0.4, "Assigning new redaction details to dataframe")
457
  new_annotations_df = new_annotations_df.assign(
458
  image=lambda df: df['page'].map(page_to_image_map),
459
  label="Redaction",
460
- color='(0, 0, 0)',
461
- id=new_id_list # Assign the pre-generated list of unique IDs
462
  ).rename(columns={
463
  'word_x0': 'xmin',
464
  'word_y0': 'ymin',
@@ -466,6 +519,17 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
466
  'word_y1': 'ymax',
467
  'word_text': 'text'
468
  })
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  annotation_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
471
  new_annotations_df = new_annotations_df[annotation_cols]
@@ -477,7 +541,7 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
477
  if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
478
  unique_new_df = new_annotations_df
479
  else:
480
- # I'm not doing checks on this anymore
481
  # merged = pd.merge(
482
  # new_annotations_df,
483
  # existing_annotations_df[key_cols].drop_duplicates(),
@@ -508,16 +572,39 @@ def create_annotation_objects_from_filtered_ocr_results_with_words(
508
  merged_df = pd.merge(all_pages_df[['image']], updated_annotations_df, on='image', how='left')
509
  else:
510
  merged_df = all_pages_df[['image']]
511
-
 
 
 
 
 
 
 
 
 
 
 
 
 
512
  final_annotations_list = []
513
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
514
 
515
- for image_path, group in progress.tqdm(merged_df.groupby('image'), desc="Adding redaction boxes to annotation object"):
 
 
 
 
 
 
 
 
516
  if pd.isna(group.iloc[0].get('id')):
517
  boxes = []
518
  else:
519
  valid_box_cols = [col for col in box_cols if col in group.columns]
520
- boxes = group[valid_box_cols].to_dict('records')
 
 
521
 
522
  final_annotations_list.append({
523
  "image": image_path,
 
404
 
405
  return list(newly_generated_ids)
406
 
407
+ def _merge_horizontally_adjacent_boxes(
408
+ df: pd.DataFrame,
409
+ x_merge_threshold: float = 0.02
410
+ ) -> pd.DataFrame:
411
+ """
412
+ Merges horizontally adjacent bounding boxes within the same line.
413
+
414
+ Args:
415
+ df (pd.DataFrame): DataFrame containing annotation boxes with columns
416
+ like 'page', 'line', 'xmin', 'xmax', etc.
417
+ x_merge_threshold (int): The maximum pixel gap on the x-axis to
418
+ consider two boxes as adjacent.
419
+
420
+ Returns:
421
+ pd.DataFrame: A new DataFrame with adjacent boxes merged.
422
+ """
423
+ if df.empty:
424
+ return df
425
+
426
+ # 1. Sort values to ensure we are comparing adjacent boxes
427
+ df_sorted = df.sort_values(by=['page', 'line', 'xmin']).copy()
428
+
429
+ # 2. Identify groups of boxes to merge using shift() and cumsum()
430
+ # Get properties of the 'previous' box in the sorted list
431
+ prev_xmax = df_sorted['xmax'].shift(1)
432
+ prev_page = df_sorted['page'].shift(1)
433
+ prev_line = df_sorted['line'].shift(1)
434
+
435
+ # A box should be merged with the previous one if it's on the same page/line
436
+ # and the horizontal gap is within the threshold.
437
+ is_adjacent = (
438
+ (df_sorted['page'] == prev_page) &
439
+ (df_sorted['line'] == prev_line) &
440
+ (df_sorted['xmin'] - prev_xmax <= x_merge_threshold)
441
+ )
442
+
443
+ # A new group starts wherever a box is NOT adjacent to the previous one.
444
+ # cumsum() on this boolean series creates a unique ID for each group.
445
+ df_sorted['merge_group'] = (~is_adjacent).cumsum()
446
+
447
+ # 3. Aggregate each group into a single bounding box
448
+ # Define how to aggregate each column
449
+ agg_funcs = {
450
+ 'xmin': 'min',
451
+ 'ymin': 'min', # To get the highest point of the combined box
452
+ 'xmax': 'max',
453
+ 'ymax': 'max', # To get the lowest point of the combined box
454
+ 'text': lambda s: ' '.join(s.astype(str)), # Join the text
455
+ # Carry over the first value for columns that are constant within a group
456
+ 'page': 'first',
457
+ 'line': 'first',
458
+ 'image': 'first',
459
+ 'label': 'first',
460
+ 'color': 'first',
461
+ }
462
+
463
+ merged_df = df_sorted.groupby('merge_group').agg(agg_funcs).reset_index(drop=True)
464
+
465
+ print(f"Merged {len(df)} annotations into {len(merged_df)}.")
466
+
467
+ return merged_df
468
+
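# Illustrative sketch (not part of this commit) of _merge_horizontally_adjacent_boxes: two words
# on the same line whose horizontal gap is within the threshold collapse into one box. The values
# are made up and use relative page coordinates, as in the function above.
import pandas as pd

boxes = pd.DataFrame({
    "page": [1, 1],
    "line": [3, 3],
    "xmin": [0.10, 0.21],
    "xmax": [0.20, 0.30],
    "ymin": [0.50, 0.50],
    "ymax": [0.52, 0.52],
    "text": ["Jane", "Doe"],
    "image": ["page_1.png", "page_1.png"],
    "label": ["Redaction", "Redaction"],
    "color": ["(0, 0, 0)", "(0, 0, 0)"],
})
merged = _merge_horizontally_adjacent_boxes(boxes)
# merged has a single row spanning xmin=0.10 to xmax=0.30 with text "Jane Doe".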
469
  def create_annotation_objects_from_filtered_ocr_results_with_words(
470
  filtered_ocr_results_with_words_df: pd.DataFrame,
471
  ocr_results_with_words_df_base: pd.DataFrame,
 
473
  existing_annotations_df: pd.DataFrame,
474
  existing_annotations_list: List[Dict],
475
  existing_recogniser_entity_df: pd.DataFrame,
476
+ progress = gr.Progress(track_tqdm=True)
477
  ) -> Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
478
  """
479
+ This function processes filtered OCR results with words to create new annotation objects. It merges these new annotations with existing ones, ensuring that horizontally adjacent boxes are combined for cleaner redactions. The function also updates the existing recogniser entity DataFrame and returns the updated annotations in both DataFrame and list-of-dicts formats.
480
+
481
+ Args:
482
+ filtered_ocr_results_with_words_df (pd.DataFrame): A DataFrame containing filtered OCR results with words.
483
+ ocr_results_with_words_df_base (pd.DataFrame): The base DataFrame of OCR results with words.
484
+ page_sizes (List[Dict]): A list of dictionaries containing page sizes.
485
+ existing_annotations_df (pd.DataFrame): A DataFrame of existing annotations.
486
+ existing_annotations_list (List[Dict]): A list of dictionaries representing existing annotations.
487
+ existing_recogniser_entity_df (pd.DataFrame): A DataFrame of existing recogniser entities.
488
+ progress (gr.Progress, optional): A progress tracker. Defaults to gr.Progress(track_tqdm=True).
489
+
490
+ Returns:
491
+ Tuple[List[Dict], List[Dict], pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: A tuple containing the updated annotations list, updated existing annotations list, updated annotations DataFrame, updated existing annotations DataFrame, updated recogniser entity DataFrame, and the original existing recogniser entity DataFrame.
492
  """
493
 
494
+ progress(0.2, "Identifying new redactions to add")
495
  print("Identifying new redactions to add")
 
496
  if filtered_ocr_results_with_words_df.empty:
497
  print("No new annotations to add.")
498
  updated_annotations_df = existing_annotations_df.copy()
499
  else:
500
+ # Assuming index relationship holds for fast lookup
 
 
 
 
 
 
 
501
  filtered_ocr_results_with_words_df.index = filtered_ocr_results_with_words_df["index"]
 
502
  new_annotations_df = ocr_results_with_words_df_base.loc[filtered_ocr_results_with_words_df.index].copy()
503
 
504
  if new_annotations_df.empty:
505
  print("No new annotations to add.")
506
  updated_annotations_df = existing_annotations_df.copy()
507
  else:
 
 
 
 
 
 
 
 
 
 
 
 
508
  page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
509
 
510
+ # Prepare the initial new annotations DataFrame
511
  new_annotations_df = new_annotations_df.assign(
512
  image=lambda df: df['page'].map(page_to_image_map),
513
  label="Redaction",
514
+ color='(0, 0, 0)'
 
515
  ).rename(columns={
516
  'word_x0': 'xmin',
517
  'word_y0': 'ymin',
 
519
  'word_y1': 'ymax',
520
  'word_text': 'text'
521
  })
522
+
523
+ progress(0.3, "Checking for adjacent annotations to merge...")
524
+ print("Checking for adjacent annotations to merge...")
525
+ new_annotations_df = _merge_horizontally_adjacent_boxes(new_annotations_df)
526
+
527
+ progress(0.4, "Creating new redaction IDs...")
528
+ print("Creating new redaction IDs...")
529
+ existing_ids = set(existing_annotations_df['id'].dropna()) if 'id' in existing_annotations_df.columns else set()
530
+ num_new_ids = len(new_annotations_df)
531
+ new_id_list = _generate_unique_ids(num_new_ids, existing_ids)
532
+ new_annotations_df['id'] = new_id_list
533
 
534
  annotation_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
535
  new_annotations_df = new_annotations_df[annotation_cols]
 
541
  if existing_annotations_df.empty or not all(col in existing_annotations_df.columns for col in key_cols):
542
  unique_new_df = new_annotations_df
543
  else:
544
+ # Checks against existing annotations are skipped here as they are too compute-intensive for large documents
545
  # merged = pd.merge(
546
  # new_annotations_df,
547
  # existing_annotations_df[key_cols].drop_duplicates(),
 
572
  merged_df = pd.merge(all_pages_df[['image']], updated_annotations_df, on='image', how='left')
573
  else:
574
  merged_df = all_pages_df[['image']]
575
+
576
+ # 1. Get the list of image paths in the exact order they appear in page_sizes.
577
+ # all_pages_df was created from page_sizes, so it preserves this order.
578
+ image_order = all_pages_df['image'].tolist()
579
+
580
+ # 2. Convert the 'image' column to a special 'Categorical' type.
581
+ # This tells pandas that this column has a custom, non-alphabetical order.
582
+ merged_df['image'] = pd.Categorical(merged_df['image'], categories=image_order, ordered=True)
583
+
584
+ # 3. Sort the DataFrame based on this new custom order.
585
+ merged_df = merged_df.sort_values('image')
586
+
587
+
588
+
589
  final_annotations_list = []
590
  box_cols = ['label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
591
 
592
+ # Now, when we group, we use `sort=False`. This tells groupby to respect the
593
+ # DataFrame's current order, which we have just manually set. This is slightly
594
+ # more efficient than letting it sort again.
595
+ for image_path, group in merged_df.groupby('image', sort=False):
596
+ # If progress reporting is needed, the loop can be wrapped with progress.tqdm, e.g.:
597
+ # for image_path, group in progress.tqdm(merged_df.groupby('image', sort=False), ...):
598
+
599
+ # Check if the group has actual annotations. iloc[0] is safe because even pages
600
+ # without annotations will have one row with NaN values from the merge.
601
  if pd.isna(group.iloc[0].get('id')):
602
  boxes = []
603
  else:
604
  valid_box_cols = [col for col in box_cols if col in group.columns]
605
+ # Sort the boxes within a page for consistency (top-to-bottom, then left-to-right)
606
+ sorted_group = group.sort_values(by=['ymin', 'xmin'])
607
+ boxes = sorted_group[valid_box_cols].to_dict('records')
608
 
609
  final_annotations_list.append({
610
  "image": image_path,