seanpedrickcase committed on
Commit
ab04c92
·
1 Parent(s): c3d1c4c

Updated duplicate pages functionality. Improved redaction efficiency slightly with the concat method. Minor modifications to documentation and interface.

app.py CHANGED
@@ -7,12 +7,12 @@ from tools.helper_functions import put_columns_in_df, get_connection_params, rev
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
- from tools.find_duplicate_pages import identify_similar_pages
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
@@ -186,6 +186,7 @@ with app:
186
  # Duplicate page detection
187
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
188
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
 
189
 
190
  # Tracking variables for current page (not visible)
191
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -376,7 +377,8 @@ with app:
376
 
377
  with gr.Accordion("Search all extracted text", open=True):
378
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
379
- reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
380
 
381
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
382
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -387,13 +389,47 @@ with app:
387
  # IDENTIFY DUPLICATE PAGES TAB
388
  ###
389
  with gr.Tab(label="Identify duplicate pages"):
390
- with gr.Accordion("Identify duplicate pages to redact", open = True):
391
- in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
392
  with gr.Row():
393
- duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
394
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
395
 
396
- duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
397
 
398
  ###
399
  # TEXT / TABULAR DATA TAB
@@ -621,7 +657,8 @@ with app:
621
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
622
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
623
  success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
624
- success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page])
 
625
 
626
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
627
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
@@ -653,7 +690,10 @@ with app:
653
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
654
 
655
  # Review OCR text button
656
- all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row])
 
 
 
657
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
658
 
659
  # Convert review file to xfdf Adobe format
@@ -684,7 +724,27 @@ with app:
684
  ###
685
  # IDENTIFY DUPLICATE PAGES
686
  ###
687
- find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages, duplicate_threshold_value, output_folder_textbox], outputs=[duplicate_pages_df, duplicate_pages_out])
688
 
689
  ###
690
  # SETTINGS PAGE INPUT / OUTPUT
 
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
+ from tools.find_duplicate_pages import run_analysis, show_page_previews
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
 
186
  # Duplicate page detection
187
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
188
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
189
+ full_data_state = gr.State() # Full data for deduplication process
190
 
191
  # Tracking variables for current page (not visible)
192
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
377
 
378
  with gr.Accordion("Search all extracted text", open=True):
379
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
380
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
381
+ selected_ocr_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "text":[]}), col_count=2, type="pandas", visible=False, headers=["page", "text"], wrap=True)
382
 
383
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
384
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
389
  # IDENTIFY DUPLICATE PAGES TAB
390
  ###
391
  with gr.Tab(label="Identify duplicate pages"):
392
+ with gr.Accordion("Step 1: Configure and Run Analysis", open = True):
393
+ in_duplicate_pages = gr.File(
394
+ label="Upload multiple 'ocr_output.csv' files to compare",
395
+ file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv']
396
+ )
397
+
398
+ gr.Markdown("#### Matching Parameters")
399
+ with gr.Row():
400
+ duplicate_threshold_input = gr.Number(value=0.95, label="Similarity Threshold", info="Score (0-1) to consider pages a match.")
401
+ min_word_count_input = gr.Number(value=10, label="Min Word Count", info="Pages with fewer words are ignored.")
402
+
403
+ gr.Markdown("#### Matching Strategy")
404
+ greedy_match_input = gr.Checkbox(
405
+ label="Enable 'Greedy' Consecutive Matching",
406
+ value=False,
407
+ info="If checked, finds the longest possible sequence of matching pages starting from any single match. Overrides the slider below."
408
+ )
409
+ min_consecutive_pages_input = gr.Slider(
410
+ minimum=1, maximum=20, value=1, step=1,
411
+ label="Minimum Consecutive Pages (for non-greedy mode)",
412
+ info="If Greedy Matching is off, use this to find sequences of a fixed minimum length."
413
+ )
414
+
415
+ find_duplicate_pages_btn = gr.Button(value="Identify Duplicate Pages", variant="primary")
416
+
417
+ with gr.Accordion("Step 2: Review Results", open=True):
418
+ gr.Markdown("### Analysis Summary\nClick on a row below to see the full page text.")
419
+ results_df_preview = gr.DataFrame(label="Similarity Results", interactive=True)
420
+
421
+ gr.Markdown("### Full Text Preview")
422
  with gr.Row():
423
+ page1_text_preview = gr.DataFrame(label="Match Source (Document 1)")
424
+ page2_text_preview = gr.DataFrame(label="Match Duplicate (Document 2)")
425
+
426
+ gr.Markdown("### Downloadable Files")
427
+ duplicate_pages_out = gr.File(
428
+ label="Download analysis summary and redaction lists (.csv)",
429
+ file_count="multiple", height=FILE_INPUT_HEIGHT
430
+ )
431
 
432
+ # Here, it would be good to call the redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool, border:float = 5) function, where each call creates a single image annotation box. page_sizes_df could potentially be used here to create the size inputs. A bool argument could be added to skip the actual pymupdf page box redaction, so that Page can be passed in as a placeholder. The convert-annotation-df-to-review-df function could then concat the new boxes onto the existing review df, updating it with the new full-page redactions (see the sketch after this diff).
433
 
434
  ###
435
  # TEXT / TABULAR DATA TAB
 
657
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
658
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
659
  success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
660
+ success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page]).\
661
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
662
 
663
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
664
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
690
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
691
 
692
  # Review OCR text button
693
+ all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_ocr_dataframe_row]).\
694
+ success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page]).\
695
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
696
+
697
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
698
 
699
  # Convert review file to xfdf Adobe format
 
724
  ###
725
  # IDENTIFY DUPLICATE PAGES
726
  ###
727
+ find_duplicate_pages_btn.click(
728
+ fn=run_analysis,
729
+ inputs=[
730
+ in_duplicate_pages,
731
+ duplicate_threshold_input,
732
+ min_word_count_input,
733
+ min_consecutive_pages_input,
734
+ greedy_match_input
735
+ ],
736
+ outputs=[
737
+ results_df_preview,
738
+ duplicate_pages_out,
739
+ full_data_state
740
+ ]
741
+ )
742
+
743
+ results_df_preview.select(
744
+ fn=show_page_previews,
745
+ inputs=[full_data_state, results_df_preview],
746
+ outputs=[page1_text_preview, page2_text_preview]
747
+ )
748
 
749
  ###
750
  # SETTINGS PAGE INPUT / OUTPUT
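A quick illustration of the idea in the duplicate-pages comment above ("Here, it would be good to call the redact_whole_pymupdf_page... function"): build one whole-page annotation box per duplicate page and concat the rows onto the existing review DataFrame in a single step. This is a hypothetical sketch, not the app's implementation: the helper name, the page_sizes_df columns ("page", "cropbox_width", "cropbox_height") and the review-df layout are assumptions, and the box is computed inline rather than via redact_whole_pymupdf_page, which needs a live pymupdf Page.

```python
import pandas as pd

def add_whole_page_redactions_to_review_df(
    duplicate_pages: list,
    page_sizes_df: pd.DataFrame,   # assumed columns: "page", "cropbox_width", "cropbox_height"
    review_df: pd.DataFrame,
    border: float = 5,
) -> pd.DataFrame:
    """Hypothetical helper mirroring the in-code note: one whole-page
    annotation box per duplicate page, appended to the review df in a
    single concat."""
    new_rows = []
    for page_no in duplicate_pages:
        sizes = page_sizes_df.loc[page_sizes_df["page"] == page_no].iloc[0]
        new_rows.append({
            "page": page_no,
            "xmin": border,
            "ymin": border,
            "xmax": sizes["cropbox_width"] - border,
            "ymax": sizes["cropbox_height"] - border,
            "label": "Whole page",
            "color": (0, 0, 0),
        })
    # Build the new rows once, then concat a single time rather than per page.
    return pd.concat([review_df, pd.DataFrame(new_rows)], ignore_index=True)
```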
index.qmd ADDED
@@ -0,0 +1,23 @@
1
+ ---
2
+ title: "Home"
3
+ ---
4
+
5
+ version: 0.7.0
6
+
7
+ Welcome to the documentation for the Document Redaction App.
8
+
9
+ Navigate through the sections to learn how to install, use, and manage the application. Below is a brief introduction to the app.
10
+
11
+ ## Document redaction
12
+
13
+ Redact personally identifiable information (PII) from documents (PDF, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](src/user_guide.qmd) for a walkthrough of how to use the app.
14
+
15
+ ![Handwriting and signatures redacted example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/refs/heads/main/review_redactions/Signatures%20and%20handwriting%20found.PNG)
16
+
17
+ To identify text in documents, the app provides several options. 'Local' text/OCR image analysis uses spaCy/Tesseract and works well for documents with typed text. If available, choose the 'AWS Textract service' to handle more complex elements, e.g. signatures or handwriting. The app then identifies personal information for redaction. The 'Local' option, based on spaCy, is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
18
+
19
+ After redaction, suggested redactions can be reviewed and modified on the 'Review redactions' tab. The original PDF can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), and custom terms to always include in or exclude from redaction.
20
+
21
+ NOTE: The app is not 100% accurate and will miss some personal information. It is essential that all outputs are reviewed **by a human** before they are used.
22
+
23
+
tools/aws_functions.py CHANGED
@@ -228,5 +228,6 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
 
231
 
232
  return final_out_message_str
 
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
231
+ print(final_out_message_str)
232
 
233
  return final_out_message_str
tools/file_conversion.py CHANGED
@@ -385,24 +385,22 @@ def convert_pymupdf_to_image_coords(pymupdf_page:Page, x1:float, y1:float, x2:fl
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
- def redact_whole_pymupdf_page(rect_height:float, rect_width:float, image:Image, page:Page, custom_colours, border:float = 5, image_dimensions:dict={}):
389
  # Small border to page that remains white
390
  border = 5
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
393
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
394
 
395
- # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image, image_dimensions=image_dimensions)
396
-
397
  # Create new image annotation element based on whole page coordinates
398
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
399
 
400
  # Write whole page annotation to annotation boxes
401
  whole_page_img_annotation_box = {}
402
- whole_page_img_annotation_box["xmin"] = whole_page_x1 #whole_page_image_x1
403
- whole_page_img_annotation_box["ymin"] = whole_page_y1 #whole_page_image_y1
404
- whole_page_img_annotation_box["xmax"] = whole_page_x2 #whole_page_image_x2
405
- whole_page_img_annotation_box["ymax"] = whole_page_y2 #whole_page_image_y2
406
  whole_page_img_annotation_box["color"] = (0,0,0)
407
  whole_page_img_annotation_box["label"] = "Whole page"
408
 
 
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
+ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool, border:float = 5):
389
  # Small border to page that remains white
390
  border = 5
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
393
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
394
 
 
 
395
  # Create new image annotation element based on whole page coordinates
396
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
397
 
398
  # Write whole page annotation to annotation boxes
399
  whole_page_img_annotation_box = {}
400
+ whole_page_img_annotation_box["xmin"] = whole_page_x1
401
+ whole_page_img_annotation_box["ymin"] = whole_page_y1
402
+ whole_page_img_annotation_box["xmax"] = whole_page_x2
403
+ whole_page_img_annotation_box["ymax"] = whole_page_y2
404
  whole_page_img_annotation_box["color"] = (0,0,0)
405
  whole_page_img_annotation_box["label"] = "Whole page"
406
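For reference, a small usage sketch of the simplified redact_whole_pymupdf_page signature, assuming a PDF already opened with PyMuPDF (the file name here is illustrative). As the call site in file_redaction.py shows, the function returns the whole-page annotation box dict built above.

```python
import fitz  # PyMuPDF

from tools.file_conversion import redact_whole_pymupdf_page

doc = fitz.open("example.pdf")   # illustrative input file
page = doc[0]
rect = page.rect                 # page size in PDF points

# custom_colours=False keeps the default black box; border defaults to 5 points.
whole_page_box = redact_whole_pymupdf_page(
    rect_height=rect.height,
    rect_width=rect.width,
    page=page,
    custom_colours=False,
)
print(whole_page_box["label"])   # "Whole page"
```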
 
tools/file_redaction.py CHANGED
@@ -1114,7 +1114,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
- whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5, image_dimensions=image_dimensions)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
@@ -1372,10 +1372,19 @@ def redact_image_pdf(file_path:str,
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
- progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
1378
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
1379
 
1380
  # Go through each page
1381
  for page_no in progress_bar:
@@ -1525,7 +1534,10 @@ def redact_image_pdf(file_path:str,
1525
  'height': result.height
1526
  } for result in page_line_level_ocr_results['results']])
1527
 
1528
- all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
 
 
 
1529
 
1530
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1531
  # Step 2: Analyse text and identify PII
@@ -1637,7 +1649,10 @@ def redact_image_pdf(file_path:str,
1637
  'page': reported_page_number
1638
  } for result in page_merged_redaction_bboxes])
1639
 
1640
- all_pages_decision_process_table_list.append(decision_process_table)
 
 
 
1641
 
1642
  decision_process_table = fill_missing_ids(decision_process_table)
1643
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
@@ -1685,8 +1700,11 @@ def redact_image_pdf(file_path:str,
1685
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1686
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1687
 
1688
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1689
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
 
 
 
1690
 
1691
 
1692
  current_loop_page += 1
@@ -1733,9 +1751,11 @@ def redact_image_pdf(file_path:str,
1733
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1734
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1735
 
 
 
1736
 
1737
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1738
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
@@ -1758,8 +1778,8 @@ def redact_image_pdf(file_path:str,
1758
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1759
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1760
 
1761
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1762
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1763
 
1764
  # Convert decision table and ocr results to relative coordinates
1765
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
@@ -2002,11 +2022,11 @@ def redact_text_pdf(
2002
  tic = time.perf_counter()
2003
 
2004
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2005
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
2006
 
2007
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2008
  # Convert decision outputs to list of dataframes:
2009
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
2010
 
2011
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2012
  out_message = "Connection to AWS Comprehend service not found."
@@ -2133,7 +2153,7 @@ def redact_text_pdf(
2133
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2134
 
2135
  if not page_decision_process_table.empty:
2136
- all_pages_decision_process_table_list.append(page_decision_process_table)
2137
 
2138
  # Else, user chose not to run redaction
2139
  else:
@@ -2145,7 +2165,7 @@ def redact_text_pdf(
2145
  if not page_text_ocr_outputs.empty:
2146
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2147
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2148
- all_line_level_ocr_results_df_list.append(page_text_ocr_outputs)
2149
 
2150
  toc = time.perf_counter()
2151
 
@@ -2168,8 +2188,8 @@ def redact_text_pdf(
2168
  annotations_all_pages.append(page_image_annotations)
2169
 
2170
  # Write logs
2171
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2172
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2173
 
2174
 
2175
  current_loop_page += 1
@@ -2193,16 +2213,16 @@ def redact_text_pdf(
2193
  progress.close(_tqdm=progress_bar)
2194
 
2195
  # Write logs
2196
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2197
 
2198
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2199
 
2200
  # Write all page outputs
2201
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2202
 
2203
- #print("all_line_level_ocr_results_df_list:", all_line_level_ocr_results_df_list)
2204
 
2205
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2206
 
2207
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2208
 
 
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, page, custom_colours, border = 5)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
 
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
+ progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
+ # If there's data from a previous run (passed in via the DataFrame parameters), add it
1378
+ all_line_level_ocr_results_list = []
1379
+ all_pages_decision_process_list = []
1380
+
1381
+ if not all_line_level_ocr_results_df.empty:
1382
+ all_line_level_ocr_results_list.extend(all_line_level_ocr_results_df.to_dict('records'))
1383
+ if not all_pages_decision_process_table.empty:
1384
+ all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
1385
+
1386
+ #all_line_level_ocr_results_list = [all_line_level_ocr_results_df.to_dict('records')]#[all_line_level_ocr_results_df]
1387
+ #all_pages_decision_process_list = [all_pages_decision_process_table.to_dict('records')]#[all_pages_decision_process_table]
1388
 
1389
  # Go through each page
1390
  for page_no in progress_bar:
 
1534
  'height': result.height
1535
  } for result in page_line_level_ocr_results['results']])
1536
 
1537
+ #all_line_level_ocr_results_list.append(line_level_ocr_results_df.to_dict('records'))
1538
+
1539
+ if not line_level_ocr_results_df.empty: # Ensure there are records to add
1540
+ all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
1541
 
1542
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1543
  # Step 2: Analyse text and identify PII
 
1649
  'page': reported_page_number
1650
  } for result in page_merged_redaction_bboxes])
1651
 
1652
+ #all_pages_decision_process_list.append(decision_process_table.to_dict('records'))
1653
+
1654
+ if not decision_process_table.empty: # Ensure there are records to add
1655
+ all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
1656
 
1657
  decision_process_table = fill_missing_ids(decision_process_table)
1658
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
 
1700
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1701
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1702
 
1703
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1704
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1705
+
1706
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1707
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1708
 
1709
 
1710
  current_loop_page += 1
 
1751
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1752
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1753
 
1754
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1755
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1756
 
1757
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1758
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1759
 
1760
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1761
 
 
1778
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1779
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1780
 
1781
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list) #pd.concat(all_pages_decision_process_list)
1782
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list) #pd.concat(all_line_level_ocr_results_list)
1783
 
1784
  # Convert decision table and ocr results to relative coordinates
1785
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
 
2022
  tic = time.perf_counter()
2023
 
2024
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2025
+ all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
2026
 
2027
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2028
  # Convert decision outputs to list of dataframes:
2029
+ all_pages_decision_process_list = [all_pages_decision_process_table]
2030
 
2031
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2032
  out_message = "Connection to AWS Comprehend service not found."
 
2153
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2154
 
2155
  if not page_decision_process_table.empty:
2156
+ all_pages_decision_process_list.append(page_decision_process_table)
2157
 
2158
  # Else, user chose not to run redaction
2159
  else:
 
2165
  if not page_text_ocr_outputs.empty:
2166
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2167
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2168
+ all_line_level_ocr_results_list.append(page_text_ocr_outputs)
2169
 
2170
  toc = time.perf_counter()
2171
 
 
2188
  annotations_all_pages.append(page_image_annotations)
2189
 
2190
  # Write logs
2191
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2192
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2193
 
2194
 
2195
  current_loop_page += 1
 
2213
  progress.close(_tqdm=progress_bar)
2214
 
2215
  # Write logs
2216
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2217
 
2218
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2219
 
2220
  # Write all page outputs
2221
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2222
 
2223
+ #print("all_line_level_ocr_results_list:", all_line_level_ocr_results_list)
2224
 
2225
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2226
 
2227
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2228
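The file_redaction.py changes above swap the old pattern of appending per-page DataFrames and calling pd.concat for a plain Python list of row dicts that is turned into a DataFrame once at the end. A self-contained toy illustration of that pattern (the page/line structure here is invented, not the app's data model):

```python
import pandas as pd

def collect_ocr_results(pages: list) -> pd.DataFrame:
    """Accumulate rows as dicts and build the DataFrame once, instead of
    concatenating a growing list of per-page DataFrames inside the loop."""
    all_rows = []
    for page in pages:
        page_rows = [
            {
                "page": page["number"],
                "text": line["text"],
                "left": line["left"],
                "top": line["top"],
                "width": line["width"],
                "height": line["height"],
            }
            for line in page["lines"]
        ]
        if page_rows:               # only extend when the page produced results
            all_rows.extend(page_rows)
    return pd.DataFrame(all_rows)   # single DataFrame construction at the end

example_pages = [
    {"number": 1, "lines": [{"text": "Hello", "left": 10, "top": 20, "width": 50, "height": 12}]},
    {"number": 2, "lines": []},     # empty pages contribute nothing
]
print(collect_ocr_results(example_pages))
```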
 
tools/find_duplicate_pages.py CHANGED
@@ -1,32 +1,20 @@
1
  import pandas as pd
2
- #import argparse
3
- #import glob
4
  import os
5
  import re
6
  from tools.helper_functions import OUTPUT_FOLDER
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- # import nltk
10
- # from nltk.corpus import stopwords
11
- # from nltk.tokenize import word_tokenize
12
- # from nltk.stem import PorterStemmer
13
- #import spacy
14
- import numpy as np
15
  import random
16
  import string
17
- from typing import List
 
18
  from gradio import Progress
 
19
 
20
- import en_core_web_lg #en_core_web_sm
21
  nlp = en_core_web_lg.load()
22
- #from tqdm import tqdm
23
-
24
- # nltk.download('punkt')
25
- # nltk.download('stopwords')
26
- # nltk.download('punkt_tab')
27
-
28
- similarity_threshold = 0.9
29
 
 
30
 
31
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
32
  """
@@ -133,89 +121,317 @@ def process_data(df:pd.DataFrame, column:str):
133
 
134
  return df
135
 
136
- def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
137
  output_paths = []
138
 
139
- progress(0.1, desc="Cleaning input text")
140
-
141
- # Load and clean data
142
- df, output_files = combine_ocr_output_text(input_files)
143
- output_paths.extend(output_files)
144
- df = process_data(df, 'text') # Assume this returns 'text_clean', 'file', and 'page' columns
145
 
146
- # Vectorize text
 
 
147
  vectorizer = TfidfVectorizer()
148
- tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
149
 
150
  progress(0.3, desc="Calculating text similarity")
151
-
152
- # Compute sparse cosine similarity
153
- similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False) # Keep sparse format
154
-
155
- # Extract indices of similar pages above threshold
156
  coo_matrix = similarity_matrix.tocoo()
157
- similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])
158
-
159
- if similar_pages.size == 0:
160
- return pd.DataFrame(), output_paths # Return empty if no matches
161
 
162
 
 
163
 
164
- # Create a DataFrame for similar pairs
165
- similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
166
 
167
- # Remove duplicate pairs (keep one direction)
168
- similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
169
 
170
- progress(0.8, desc="Mapping back results")
171
- # Map indices to metadata
172
- # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
173
- # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
174
- # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
175
- # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
176
- # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
177
- # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
178
- # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])
 
 
 
179
 
180
- # Create a DataFrame with the metadata
181
- metadata_df = df[['file', 'page', 'text']].reset_index()
182
 
183
- # Merge to get the metadata for Page1
184
- similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
185
- similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
 
186
 
187
- # Merge to get the metadata for Page2
188
- similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
189
- similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
190
 
191
- # Optionally, drop the index columns if not needed
192
- #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])
 
193
 
 
194
 
195
- similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)
 
 
196
 
197
- # Sort results
198
- similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
199
- similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
200
 
201
- similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'][0:100]
202
- similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'][0:100]
203
 
204
- progress(0.8, desc="Saving output files")
205
 
206
- # Save results
207
- similarity_file_output_path = output_folder + 'page_similarity_results.csv'
208
- similarity_df_out.to_csv(similarity_file_output_path, index=False)
209
- output_paths.append(similarity_file_output_path)
210
 
211
- # Save per-file redaction lists
212
- for redact_file in similarity_df_out['Page2_File'].unique():
213
- output_file_name = output_folder + redact_file + "_whole_page.csv"
214
- whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
215
- whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
216
- output_paths.append(output_file_name)
217
 
218
- return similarity_df_out, output_paths
219
 
220
  # Perturb text
221
  # Apply the perturbation function with a 10% error probability
 
1
  import pandas as pd
 
 
2
  import os
3
  import re
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import random
8
  import string
9
+ from typing import List, Tuple
10
+ import gradio as gr
11
  from gradio import Progress
12
+ from pathlib import Path
13
 
14
+ import en_core_web_lg
15
  nlp = en_core_web_lg.load()
16
 
17
+ similarity_threshold = 0.95
18
 
19
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
20
  """
 
121
 
122
  return df
123
 
124
+ def map_metadata_single_page(similarity_df, metadata_source_df):
125
+ """Helper to map metadata for single page results."""
126
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
127
+ results_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_index=True)\
128
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
129
+ results_df = results_df.merge(metadata_df, left_on='Page2_Index', right_index=True, suffixes=('_1', '_2'))\
130
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
131
+ results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
132
+ final_df = results_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
133
+ final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
134
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:200]
135
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:200]
136
+ return final_df
137
+
138
+
139
+ def map_metadata_subdocument(subdocument_df, metadata_source_df):
140
+ """Helper to map metadata for subdocument results."""
141
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
142
+
143
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page1_Start_Index', right_index=True)\
144
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})
145
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page1_End_Index', right_index=True)\
146
+ .rename(columns={'page': 'Page1_End_Page'})
147
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page2_Start_Index', right_index=True)\
148
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})
149
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page2_End_Index', right_index=True)\
150
+ .rename(columns={'page': 'Page2_End_Page'})
151
+
152
+ cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
153
+ 'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
154
+ 'Match_Length', 'Page1_Text', 'Page2_Text']
155
+
156
+ # Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
157
+ if 'Avg_Similarity' in subdocument_df.columns:
158
+ subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
159
+ cols.insert(7, 'Avg_Similarity')
160
+
161
+ final_df = subdocument_df[cols]
162
+ final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
163
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:200]
164
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:200]
165
+ return final_df
166
+
167
+ def identify_similar_pages(
168
+ df_combined: pd.DataFrame,
169
+ similarity_threshold: float = 0.9,
170
+ min_word_count: int = 10,
171
+ min_consecutive_pages: int = 1,
172
+ greedy_match: bool = False, # NEW parameter
173
+ output_folder: str = OUTPUT_FOLDER,
174
+ progress=Progress(track_tqdm=True)
175
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
176
+ """
177
+ Identifies similar pages with three possible strategies:
178
+ 1. Single Page: If greedy_match=False and min_consecutive_pages=1.
179
+ 2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
180
+ 3. Greedy Consecutive Match: If greedy_match=True.
181
+ """
182
+ # ... (Initial setup: progress, data loading/processing, word count filter) ...
183
+ # This part remains the same as before.
184
  output_paths = []
185
+ progress(0.1, desc="Processing and filtering text")
186
+ df = process_data(df_combined, 'text')
187
+ df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
188
+ original_row_count = len(df)
189
+ df_filtered = df[df['word_count'] >= min_word_count].copy()
190
+ df_filtered.reset_index(drop=True, inplace=True)
191
 
192
+ print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
 
 
 
 
 
193
 
194
+ if len(df_filtered) < 2:
195
+ return pd.DataFrame(), [], df_combined
196
+
197
  vectorizer = TfidfVectorizer()
198
+ tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
199
 
200
  progress(0.3, desc="Calculating text similarity")
201
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
 
 
 
 
202
  coo_matrix = similarity_matrix.tocoo()
 
 
 
 
203
 
204
+ # Create a DataFrame of all individual page pairs above the threshold.
205
+ # This is the base for all three matching strategies.
206
+ similar_pages = [
207
+ (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
208
+ if r < c and v >= similarity_threshold
209
+ ]
210
+
211
+ if not similar_pages:
212
+ return pd.DataFrame(), [], df_combined
213
 
214
+ base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
215
 
216
+ progress(0.6, desc="Aggregating results based on matching strategy")
 
217
 
218
+ # --- NEW: Logic to select matching strategy ---
 
219
 
220
+ if greedy_match:
221
+ # --- STRATEGY 3: Greedy Consecutive Matching ---
222
+ print("Finding matches using GREEDY consecutive strategy.")
223
+
224
+ # A set of pairs for fast lookups of (page1_idx, page2_idx)
225
+ valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
226
+
227
+ # Keep track of indices that have been used in a sequence
228
+ consumed_indices_1 = set()
229
+ consumed_indices_2 = set()
230
+
231
+ all_sequences = []
232
 
233
+ # Iterate through all potential starting pairs, sorted for consistent results
234
+ sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
235
 
236
+ for _, row in sorted_pairs.iterrows():
237
+ start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
238
+
239
+ # If this pair has already been consumed by a previous sequence, skip it
240
+ if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
241
+ continue
242
+
243
+ # This is a new sequence, start expanding it
244
+ current_sequence = [(start_idx1, start_idx2)]
245
+ k = 1
246
+ while True:
247
+ next_idx1 = start_idx1 + k
248
+ next_idx2 = start_idx2 + k
249
+
250
+ # Check if the next pair in the sequence is a valid match
251
+ if (next_idx1, next_idx2) in valid_pairs_set and \
252
+ next_idx1 not in consumed_indices_1 and \
253
+ next_idx2 not in consumed_indices_2:
254
+ current_sequence.append((next_idx1, next_idx2))
255
+ k += 1
256
+ else:
257
+ # The sequence has ended
258
+ break
259
+
260
+ # Record the found sequence and mark all its pages as consumed
261
+ sequence_indices_1 = [p[0] for p in current_sequence]
262
+ sequence_indices_2 = [p[1] for p in current_sequence]
263
+
264
+ all_sequences.append({
265
+ 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
266
+ 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
267
+ 'Match_Length': len(current_sequence)
268
+ })
269
+
270
+ consumed_indices_1.update(sequence_indices_1)
271
+ consumed_indices_2.update(sequence_indices_2)
272
+
273
+ if not all_sequences:
274
+ return pd.DataFrame(), [], df_combined
275
+
276
+ subdocument_df = pd.DataFrame(all_sequences)
277
+ # We can add back the average similarity if needed, but it requires more lookups.
278
+ # For now, we'll omit it for simplicity in the greedy approach.
279
+ # ... (The rest is metadata mapping, same as the subdocument case)
280
+
281
+ elif min_consecutive_pages > 1:
282
+ # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
283
+ print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
284
+ similarity_df = base_similarity_df.copy()
285
+ similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
286
+ is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
287
+ block_id = is_consecutive.eq(False).cumsum()
288
+ grouped = similarity_df.groupby(block_id)
289
+ agg_results = grouped.agg(
290
+ Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
291
+ Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
292
+ Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
293
+ ).reset_index(drop=True)
294
+ subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
295
+ if subdocument_df.empty: return pd.DataFrame(), [], df_combined
296
 
297
+ else:
298
+ # --- STRATEGY 1: Single Page Matching ---
299
+ print(f"Finding single page matches (min_consecutive_pages=1)")
300
+ final_df = map_metadata_single_page(base_similarity_df, df_filtered)
301
+ # The rest of the logic (saving files) is handled after this if/else block
302
+ pass # The final_df is already prepared
303
+
304
+ # --- Map metadata and format output ---
305
+ # This block now handles the output for both subdocument strategies (2 and 3)
306
+ if greedy_match or min_consecutive_pages > 1:
307
+ final_df = map_metadata_subdocument(subdocument_df, df_filtered)
308
+
309
+ progress(0.8, desc="Saving output files")
310
+
311
+ # If no matches were found, final_df could be empty.
312
+ if final_df.empty:
313
+ print("No matches found, no output files to save.")
314
+ return final_df, [], df_combined
315
+
316
+ # --- 1. Save the main results DataFrame ---
317
+ # This file contains the detailed summary of all matches found.
318
+ similarity_file_output_path = Path(output_folder) / 'page_similarity_results.csv'
319
+ final_df.to_csv(similarity_file_output_path, index=False)
320
+ output_paths.append(str(similarity_file_output_path))
321
+ print(f"Main results saved to {similarity_file_output_path}")
322
+
323
+ # --- 2. Save per-file redaction lists ---
324
+ # These files contain a simple list of page numbers to redact for each document
325
+ # that contains duplicate content.
326
+
327
+ # We group by the file containing the duplicates ('Page2_File')
328
+ for redact_file, group in final_df.groupby('Page2_File'):
329
+ output_file_name_stem = Path(redact_file).stem
330
+ output_file_path = Path(output_folder) / f"{output_file_name_stem}_pages_to_redact.csv"
331
+
332
+ all_pages_to_redact = set()
333
+
334
+ # Check if the results are for single pages or subdocuments
335
+ is_subdocument_match = 'Page2_Start_Page' in group.columns
336
+
337
+ if is_subdocument_match:
338
+ # For subdocument matches, create a range of pages for each match
339
+ for _, row in group.iterrows():
340
+ # Generate all page numbers from the start to the end of the match
341
+ pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
342
+ all_pages_to_redact.update(pages_in_range)
343
+ else:
344
+ # For single-page matches, just add the page number
345
+ pages = group['Page2_Page'].unique()
346
+ all_pages_to_redact.update(pages)
347
+
348
+ if all_pages_to_redact:
349
+ # Create a DataFrame from the sorted list of pages to redact
350
+ redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
351
+ redaction_df.to_csv(output_file_path, header=False, index=False)
352
+ output_paths.append(str(output_file_path))
353
+ print(f"Redaction list for {redact_file} saved to {output_file_path}")
354
 
355
+ # Note: The 'combined ocr output' csv was part of the original data loading function,
356
+ # not the analysis function itself. If you need that, it should be saved within
357
+ # your `combine_ocr_output_text` function.
358
 
359
+ return final_df, output_paths, df_combined
360
 
361
+ # ==============================================================================
362
+ # GRADIO HELPER FUNCTIONS
363
+ # ==============================================================================
364
 
365
+ def run_analysis(files, threshold, min_words, min_consecutive, greedy_match, progress=gr.Progress(track_tqdm=True)):
366
+ """
367
+ Wrapper function updated to include the 'greedy_match' boolean.
368
+ """
369
+ if not files:
370
+ gr.Warning("Please upload files to analyze.")
371
+ return None, None, None
372
+
373
+ progress(0, desc="Combining input files...")
374
+ df_combined, _ = combine_ocr_output_text(files)
375
+
376
+ if df_combined.empty:
377
+ gr.Warning("No data found in the uploaded files.")
378
+ return None, None, None
379
+
380
+ # Call the main analysis function with the new parameter
381
+ results_df, output_paths, full_df = identify_similar_pages(
382
+ df_combined=df_combined,
383
+ similarity_threshold=threshold,
384
+ min_word_count=min_words,
385
+ min_consecutive_pages=int(min_consecutive),
386
+ greedy_match=greedy_match, # Pass the new boolean
387
+ progress=progress
388
+ )
389
+
390
+ return results_df, output_paths, full_df
391
 
392
+ def show_page_previews(full_data, results_df, evt: gr.SelectData):
393
+ """
394
+ Triggered when a user selects a row in the results DataFrame.
395
+ It uses the stored 'full_data' to find and display the complete text.
396
+ """
397
+ if full_data is None or results_df is None:
398
+ return None, None # Return empty dataframes if no analysis has been run
399
 
400
+ selected_row = results_df.iloc[evt.index[0]]
401
+
402
+ # Determine if it's a single page or a multi-page (subdocument) match
403
+ is_subdocument_match = 'Page1_Start_Page' in selected_row
404
+
405
+ if is_subdocument_match:
406
+ # --- Handle Subdocument Match ---
407
+ file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
408
+ file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']
409
+
410
+ page1_data = full_data[
411
+ (full_data['file'] == file1) &
412
+ (full_data['page'].between(start1, end1))
413
+ ].sort_values('page')[['page', 'text']]
414
+
415
+ page2_data = full_data[
416
+ (full_data['file'] == file2) &
417
+ (full_data['page'].between(start2, end2))
418
+ ].sort_values('page')[['page', 'text']]
419
+
420
+ else:
421
+ # --- Handle Single Page Match ---
422
+ file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
423
+ file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']
424
+
425
+ page1_data = full_data[
426
+ (full_data['file'] == file1) & (full_data['page'] == page1)
427
+ ][['page', 'text']]
428
 
429
+ page2_data = full_data[
430
+ (full_data['file'] == file2) & (full_data['page'] == page2)
431
+ ][['page', 'text']]
 
432
 
433
+ return page1_data, page2_data
434
 
 
435
 
436
  # Perturb text
437
  # Apply the perturbation function with a 10% error probability
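The greedy strategy added to identify_similar_pages above expands any matching page pair (i, j) to (i+1, j+1), (i+2, j+2), and so on while those pairs are also above the similarity threshold, marking consumed indices so sequences do not overlap. A standalone sketch of just that expansion step, run on an invented set of matching index pairs:

```python
def greedy_sequences(pairs):
    """pairs: set of (page1_index, page2_index) tuples that exceed the
    similarity threshold. Returns non-overlapping consecutive runs."""
    consumed_1, consumed_2, sequences = set(), set(), []
    for i, j in sorted(pairs):
        if i in consumed_1 or j in consumed_2:
            continue                       # already part of an earlier run
        k = 0
        # Extend the run while the next diagonal pair is also a valid match
        while (i + k + 1, j + k + 1) in pairs \
                and (i + k + 1) not in consumed_1 \
                and (j + k + 1) not in consumed_2:
            k += 1
        sequences.append({
            "Page1_Start_Index": i, "Page1_End_Index": i + k,
            "Page2_Start_Index": j, "Page2_End_Index": j + k,
            "Match_Length": k + 1,
        })
        consumed_1.update(range(i, i + k + 1))
        consumed_2.update(range(j, j + k + 1))
    return sequences

# One 3-page run (3-5 vs 10-12) plus a single-page match (8 vs 20)
print(greedy_sequences({(3, 10), (4, 11), (5, 12), (8, 20)}))
```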
tools/redaction_review.py CHANGED
@@ -1040,9 +1040,12 @@ def reset_dropdowns(df:pd.DataFrame):
1040
 
1041
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1042
 
 
 
 
1043
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1044
 
1045
- row_value_page = evt.row_value[0] # This is the page number value
1046
  row_value_label = evt.row_value[1] # This is the label number value
1047
  row_value_text = evt.row_value[2] # This is the text number value
1048
  row_value_id = evt.row_value[3] # This is the text number value
@@ -1072,7 +1075,7 @@ def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
1072
 
1073
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1074
 
1075
- row_value_page = evt.row_value[0] # This is the page_number value
1076
  row_value_text = evt.row_value[1] # This is the text contents
1077
 
1078
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
 
1040
 
1041
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1042
 
1043
+ def increase_bottom_page_count_based_on_top(page_number:int):
1044
+ return int(page_number)
1045
+
1046
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1047
 
1048
+ row_value_page = int(evt.row_value[0]) # This is the page number value
1049
  row_value_label = evt.row_value[1] # This is the label number value
1050
  row_value_text = evt.row_value[2] # This is the text number value
1051
  row_value_id = evt.row_value[3] # This is the text number value
 
1075
 
1076
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1077
 
1078
+ row_value_page = int(evt.row_value[0]) # This is the page_number value
1079
  row_value_text = evt.row_value[1] # This is the text contents
1080
 
1081
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
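The redaction_review.py changes cast the selected page cell to int before it is used as a page number, since values coming back from a Gradio Dataframe cell may arrive as floats or strings. A minimal, self-contained sketch of that select-callback pattern (the component names and layout here are illustrative, not the app's wiring):

```python
import gradio as gr
import pandas as pd

def on_ocr_row_select(df: pd.DataFrame, evt: gr.SelectData):
    # evt.row_value holds the full selected row; cast the page cell to int
    # before using it as a page number elsewhere in the app.
    row_value_page = int(evt.row_value[0])
    row_value_text = evt.row_value[1]
    return row_value_page, pd.DataFrame({"page": [row_value_page], "text": [row_value_text]})

with gr.Blocks() as demo:
    ocr_table = gr.Dataframe(pd.DataFrame({"page": [1, 2], "text": ["Hello", "World"]}))
    selected_page = gr.Number(label="Selected page", precision=0)
    selected_row = gr.Dataframe(visible=False)
    ocr_table.select(on_ocr_row_select, inputs=[ocr_table], outputs=[selected_page, selected_row])

# demo.launch()  # uncomment to try it locally
```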