seanpedrickcase committed on
Commit
ab04c92
·
1 Parent(s): c3d1c4c

Updated duplicate pages functionality. Improved redaction efficiency slightly with the concat method. Minor modifications to documentation and interface.

app.py CHANGED
@@ -7,12 +7,12 @@ from tools.helper_functions import put_columns_in_df, get_connection_params, rev
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
- from tools.find_duplicate_pages import identify_similar_pages
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
@@ -186,6 +186,7 @@ with app:
186
  # Duplicate page detection
187
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
188
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
 
189
 
190
  # Tracking variables for current page (not visible)
191
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -376,7 +377,8 @@ with app:
376
 
377
  with gr.Accordion("Search all extracted text", open=True):
378
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
379
- reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
380
 
381
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
382
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -387,13 +389,47 @@ with app:
387
  # IDENTIFY DUPLICATE PAGES TAB
388
  ###
389
  with gr.Tab(label="Identify duplicate pages"):
390
- with gr.Accordion("Identify duplicate pages to redact", open = True):
391
- in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
392
  with gr.Row():
393
- duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
394
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
395
 
396
- duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
397
 
398
  ###
399
  # TEXT / TABULAR DATA TAB
@@ -621,7 +657,8 @@ with app:
621
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
622
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
623
  success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
624
- success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page])
 
625
 
626
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
627
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
@@ -653,7 +690,10 @@ with app:
653
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
654
 
655
  # Review OCR text button
656
- all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row])
 
 
 
657
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
658
 
659
  # Convert review file to xfdf Adobe format
@@ -684,7 +724,27 @@ with app:
684
  ###
685
  # IDENTIFY DUPLICATE PAGES
686
  ###
687
- find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages, duplicate_threshold_value, output_folder_textbox], outputs=[duplicate_pages_df, duplicate_pages_out])
688
 
689
  ###
690
  # SETTINGS PAGE INPUT / OUTPUT
 
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top
11
  from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
+ from tools.find_duplicate_pages import run_analysis, show_page_previews
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
 
186
  # Duplicate page detection
187
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
188
  duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
189
+ full_data_state = gr.State() # Full data for deduplication process
190
 
191
  # Tracking variables for current page (not visible)
192
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
377
 
378
  with gr.Accordion("Search all extracted text", open=True):
379
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
380
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
381
+ selected_ocr_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "text":[]}), col_count=2, type="pandas", visible=False, headers=["page", "text"], wrap=True)
382
 
383
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
384
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
389
  # IDENTIFY DUPLICATE PAGES TAB
390
  ###
391
  with gr.Tab(label="Identify duplicate pages"):
392
+ with gr.Accordion("Step 1: Configure and Run Analysis", open = True):
393
+ in_duplicate_pages = gr.File(
394
+ label="Upload multiple 'ocr_output.csv' files to compare",
395
+ file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv']
396
+ )
397
+
398
+ gr.Markdown("#### Matching Parameters")
399
+ with gr.Row():
400
+ duplicate_threshold_input = gr.Number(value=0.95, label="Similarity Threshold", info="Score (0-1) to consider pages a match.")
401
+ min_word_count_input = gr.Number(value=10, label="Min Word Count", info="Pages with fewer words are ignored.")
402
+
403
+ gr.Markdown("#### Matching Strategy")
404
+ greedy_match_input = gr.Checkbox(
405
+ label="Enable 'Greedy' Consecutive Matching",
406
+ value=False,
407
+ info="If checked, finds the longest possible sequence of matching pages starting from any single match. Overrides the slider below."
408
+ )
409
+ min_consecutive_pages_input = gr.Slider(
410
+ minimum=1, maximum=20, value=1, step=1,
411
+ label="Minimum Consecutive Pages (for non-greedy mode)",
412
+ info="If Greedy Matching is off, use this to find sequences of a fixed minimum length."
413
+ )
414
+
415
+ find_duplicate_pages_btn = gr.Button(value="Identify Duplicate Pages", variant="primary")
416
+
417
+ with gr.Accordion("Step 2: Review Results", open=True):
418
+ gr.Markdown("### Analysis Summary\nClick on a row below to see the full page text.")
419
+ results_df_preview = gr.DataFrame(label="Similarity Results", interactive=True)
420
+
421
+ gr.Markdown("### Full Text Preview")
422
  with gr.Row():
423
+ page1_text_preview = gr.DataFrame(label="Match Source (Document 1)")
424
+ page2_text_preview = gr.DataFrame(label="Match Duplicate (Document 2)")
425
+
426
+ gr.Markdown("### Downloadable Files")
427
+ duplicate_pages_out = gr.File(
428
+ label="Download analysis summary and redaction lists (.csv)",
429
+ file_count="multiple", height=FILE_INPUT_HEIGHT
430
+ )
431
 
432
+ # Here, it would be good to call the redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool, border:float = 5) function, where each call creates a single image annotation box. page_sizes_df could potentially be used here to create the size inputs. A bool argument could be added to skip the actual pymupdf page box redaction, so that Page can be passed in as a placeholder. The convert-annotation-df-to-review-df function could then concat the new boxes onto the existing review df, updating it with the new full-page redactions (see the sketch after this diff).
433
 
434
  ###
435
  # TEXT / TABULAR DATA TAB
 
657
  # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
658
  recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
659
  success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
660
+ success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page]).\
661
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
662
 
663
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
664
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
690
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
691
 
692
  # Review OCR text button
693
+ all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_ocr_dataframe_row]).\
694
+ success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page]).\
695
+ success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
696
+
697
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
698
 
699
  # Convert review file to xfdf Adobe format
 
724
  ###
725
  # IDENTIFY DUPLICATE PAGES
726
  ###
727
+ find_duplicate_pages_btn.click(
728
+ fn=run_analysis,
729
+ inputs=[
730
+ in_duplicate_pages,
731
+ duplicate_threshold_input,
732
+ min_word_count_input,
733
+ min_consecutive_pages_input,
734
+ greedy_match_input
735
+ ],
736
+ outputs=[
737
+ results_df_preview,
738
+ duplicate_pages_out,
739
+ full_data_state
740
+ ]
741
+ )
742
+
743
+ results_df_preview.select(
744
+ fn=show_page_previews,
745
+ inputs=[full_data_state, results_df_preview],
746
+ outputs=[page1_text_preview, page2_text_preview]
747
+ )
748
 
749
  ###
750
  # SETTINGS PAGE INPUT / OUTPUT
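A quick illustration of the idea in the duplicate-pages comment above ("Here, it would be good to call the redact_whole_pymupdf_page... function"): build one whole-page annotation box per duplicate page and concat the rows onto the existing review DataFrame in a single step. This is a hypothetical sketch, not the app's implementation: the helper name, the page_sizes_df columns ("page", "cropbox_width", "cropbox_height") and the review-df layout are assumptions, and the box is computed inline rather than via redact_whole_pymupdf_page, which needs a live pymupdf Page.

```python
import pandas as pd

def add_whole_page_redactions_to_review_df(
    duplicate_pages: list,
    page_sizes_df: pd.DataFrame,   # assumed columns: "page", "cropbox_width", "cropbox_height"
    review_df: pd.DataFrame,
    border: float = 5,
) -> pd.DataFrame:
    """Hypothetical helper mirroring the in-code note: one whole-page
    annotation box per duplicate page, appended to the review df in a
    single concat."""
    new_rows = []
    for page_no in duplicate_pages:
        sizes = page_sizes_df.loc[page_sizes_df["page"] == page_no].iloc[0]
        new_rows.append({
            "page": page_no,
            "xmin": border,
            "ymin": border,
            "xmax": sizes["cropbox_width"] - border,
            "ymax": sizes["cropbox_height"] - border,
            "label": "Whole page",
            "color": (0, 0, 0),
        })
    # Build the new rows once, then concat a single time rather than per page.
    return pd.concat([review_df, pd.DataFrame(new_rows)], ignore_index=True)
```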
index.qmd ADDED
@@ -0,0 +1,23 @@
1
+ ---
2
+ title: "Home"
3
+ ---
4
+
5
+ version: 0.7.0
6
+
7
+ Welcome to the documentation for the Document Redaction App.
8
+
9
+ Navigate through the sections to learn how to install, use, and manage the application. Below is a brief introduction to the app.
10
+
11
+ ## Document redaction
12
+
13
+ Redact personally identifiable information (PII) from documents (PDF, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](src/user_guide.qmd) for a walkthrough of how to use the app.
14
+
15
+ ![Handwriting and signatures redacted example](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/refs/heads/main/review_redactions/Signatures%20and%20handwriting%20found.PNG)
16
+
17
+ To identify text in documents, the app provides several options. 'Local' text/OCR image analysis uses spaCy/Tesseract and works well for documents with typed text. If available, choose the 'AWS Textract service' to handle more complex elements, e.g. signatures or handwriting. The app then identifies personal information for redaction. The 'Local' option, based on spaCy, is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
18
+
19
+ After redaction, suggested redactions can be reviewed and modified on the 'Review redactions' tab. The original PDF can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the types of information to redact (e.g. people, places), and custom terms to always include in or exclude from redaction.
20
+
21
+ NOTE: The app is not 100% accurate and will miss some personal information. It is essential that all outputs are reviewed **by a human** before they are used.
22
+
23
+
tools/aws_functions.py CHANGED
@@ -228,5 +228,6 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
 
231
 
232
  return final_out_message_str
 
228
  print(final_out_message_str)
229
  else:
230
  final_out_message_str = "App not set to run AWS functions"
231
+ print(final_out_message_str)
232
 
233
  return final_out_message_str
tools/file_conversion.py CHANGED
@@ -385,24 +385,22 @@ def convert_pymupdf_to_image_coords(pymupdf_page:Page, x1:float, y1:float, x2:fl
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
- def redact_whole_pymupdf_page(rect_height:float, rect_width:float, image:Image, page:Page, custom_colours, border:float = 5, image_dimensions:dict={}):
389
  # Small border to page that remains white
390
  border = 5
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
393
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
394
 
395
- # whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image, image_dimensions=image_dimensions)
396
-
397
  # Create new image annotation element based on whole page coordinates
398
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
399
 
400
  # Write whole page annotation to annotation boxes
401
  whole_page_img_annotation_box = {}
402
- whole_page_img_annotation_box["xmin"] = whole_page_x1 #whole_page_image_x1
403
- whole_page_img_annotation_box["ymin"] = whole_page_y1 #whole_page_image_y1
404
- whole_page_img_annotation_box["xmax"] = whole_page_x2 #whole_page_image_x2
405
- whole_page_img_annotation_box["ymax"] = whole_page_y2 #whole_page_image_y2
406
  whole_page_img_annotation_box["color"] = (0,0,0)
407
  whole_page_img_annotation_box["label"] = "Whole page"
408
 
 
385
 
386
  return x1_image, y1_image, x2_image, y2_image
387
 
388
+ def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool, border:float = 5):
389
  # Small border to page that remains white
390
  border = 5
391
  # Define the coordinates for the Rect
392
  whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
393
  whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
394
 
 
 
395
  # Create new image annotation element based on whole page coordinates
396
  whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
397
 
398
  # Write whole page annotation to annotation boxes
399
  whole_page_img_annotation_box = {}
400
+ whole_page_img_annotation_box["xmin"] = whole_page_x1
401
+ whole_page_img_annotation_box["ymin"] = whole_page_y1
402
+ whole_page_img_annotation_box["xmax"] = whole_page_x2
403
+ whole_page_img_annotation_box["ymax"] = whole_page_y2
404
  whole_page_img_annotation_box["color"] = (0,0,0)
405
  whole_page_img_annotation_box["label"] = "Whole page"
406
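For reference, a small usage sketch of the simplified redact_whole_pymupdf_page signature, assuming a PDF already opened with PyMuPDF (the file name here is illustrative). As the call site in file_redaction.py shows, the function returns the whole-page annotation box dict built above.

```python
import fitz  # PyMuPDF

from tools.file_conversion import redact_whole_pymupdf_page

doc = fitz.open("example.pdf")   # illustrative input file
page = doc[0]
rect = page.rect                 # page size in PDF points

# custom_colours=False keeps the default black box; border defaults to 5 points.
whole_page_box = redact_whole_pymupdf_page(
    rect_height=rect.height,
    rect_width=rect.width,
    page=page,
    custom_colours=False,
)
print(whole_page_box["label"])   # "Whole page"
```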
 
tools/file_redaction.py CHANGED
@@ -1114,7 +1114,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
- whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5, image_dimensions=image_dimensions)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
@@ -1372,10 +1372,19 @@ def redact_image_pdf(file_path:str,
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
- progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
1378
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
1379
 
1380
  # Go through each page
1381
  for page_no in progress_bar:
@@ -1525,7 +1534,10 @@ def redact_image_pdf(file_path:str,
1525
  'height': result.height
1526
  } for result in page_line_level_ocr_results['results']])
1527
 
1528
- all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
 
 
 
1529
 
1530
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1531
  # Step 2: Analyse text and identify PII
@@ -1637,7 +1649,10 @@ def redact_image_pdf(file_path:str,
1637
  'page': reported_page_number
1638
  } for result in page_merged_redaction_bboxes])
1639
 
1640
- all_pages_decision_process_table_list.append(decision_process_table)
 
 
 
1641
 
1642
  decision_process_table = fill_missing_ids(decision_process_table)
1643
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
@@ -1685,8 +1700,11 @@ def redact_image_pdf(file_path:str,
1685
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1686
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1687
 
1688
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1689
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
 
 
 
1690
 
1691
 
1692
  current_loop_page += 1
@@ -1733,9 +1751,11 @@ def redact_image_pdf(file_path:str,
1733
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1734
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1735
 
 
 
1736
 
1737
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1738
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
@@ -1758,8 +1778,8 @@ def redact_image_pdf(file_path:str,
1758
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1759
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1760
 
1761
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1762
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1763
 
1764
  # Convert decision table and ocr results to relative coordinates
1765
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
@@ -2002,11 +2022,11 @@ def redact_text_pdf(
2002
  tic = time.perf_counter()
2003
 
2004
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2005
- all_line_level_ocr_results_df_list = [all_line_level_ocr_results_df]
2006
 
2007
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2008
  # Convert decision outputs to list of dataframes:
2009
- all_pages_decision_process_table_list = [all_pages_decision_process_table]
2010
 
2011
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2012
  out_message = "Connection to AWS Comprehend service not found."
@@ -2133,7 +2153,7 @@ def redact_text_pdf(
2133
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2134
 
2135
  if not page_decision_process_table.empty:
2136
- all_pages_decision_process_table_list.append(page_decision_process_table)
2137
 
2138
  # Else, user chose not to run redaction
2139
  else:
@@ -2145,7 +2165,7 @@ def redact_text_pdf(
2145
  if not page_text_ocr_outputs.empty:
2146
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2147
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2148
- all_line_level_ocr_results_df_list.append(page_text_ocr_outputs)
2149
 
2150
  toc = time.perf_counter()
2151
 
@@ -2168,8 +2188,8 @@ def redact_text_pdf(
2168
  annotations_all_pages.append(page_image_annotations)
2169
 
2170
  # Write logs
2171
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2172
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2173
 
2174
 
2175
  current_loop_page += 1
@@ -2193,16 +2213,16 @@ def redact_text_pdf(
2193
  progress.close(_tqdm=progress_bar)
2194
 
2195
  # Write logs
2196
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2197
 
2198
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2199
 
2200
  # Write all page outputs
2201
- all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2202
 
2203
- #print("all_line_level_ocr_results_df_list:", all_line_level_ocr_results_df_list)
2204
 
2205
- all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2206
 
2207
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2208
 
 
1114
  # If whole page is to be redacted, do that here
1115
  if redact_whole_page == True:
1116
 
1117
+ whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, page, custom_colours, border = 5)
1118
  all_image_annotation_boxes.append(whole_page_img_annotation_box)
1119
 
1120
  out_annotation_boxes = {
 
1372
  if current_loop_page == 0: page_loop_start = 0
1373
  else: page_loop_start = current_loop_page
1374
 
1375
+ progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1376
 
1377
+ # If there's data from a previous run (passed in via the DataFrame parameters), add it
1378
+ all_line_level_ocr_results_list = []
1379
+ all_pages_decision_process_list = []
1380
+
1381
+ if not all_line_level_ocr_results_df.empty:
1382
+ all_line_level_ocr_results_list.extend(all_line_level_ocr_results_df.to_dict('records'))
1383
+ if not all_pages_decision_process_table.empty:
1384
+ all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
1385
+
1386
+ #all_line_level_ocr_results_list = [all_line_level_ocr_results_df.to_dict('records')]#[all_line_level_ocr_results_df]
1387
+ #all_pages_decision_process_list = [all_pages_decision_process_table.to_dict('records')]#[all_pages_decision_process_table]
1388
 
1389
  # Go through each page
1390
  for page_no in progress_bar:
 
1534
  'height': result.height
1535
  } for result in page_line_level_ocr_results['results']])
1536
 
1537
+ #all_line_level_ocr_results_list.append(line_level_ocr_results_df.to_dict('records'))
1538
+
1539
+ if not line_level_ocr_results_df.empty: # Ensure there are records to add
1540
+ all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
1541
 
1542
  if pii_identification_method != NO_REDACTION_PII_OPTION:
1543
  # Step 2: Analyse text and identify PII
 
1649
  'page': reported_page_number
1650
  } for result in page_merged_redaction_bboxes])
1651
 
1652
+ #all_pages_decision_process_list.append(decision_process_table.to_dict('records'))
1653
+
1654
+ if not decision_process_table.empty: # Ensure there are records to add
1655
+ all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
1656
 
1657
  decision_process_table = fill_missing_ids(decision_process_table)
1658
  decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
 
1700
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1701
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1702
 
1703
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1704
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1705
+
1706
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1707
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1708
 
1709
 
1710
  current_loop_page += 1
 
1751
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1752
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1753
 
1754
+ #all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
1755
+ #all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
1756
 
1757
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1758
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
1759
 
1760
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1761
 
 
1778
  if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1779
  log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1780
 
1781
+ all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list) #pd.concat(all_pages_decision_process_list)
1782
+ all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list) #pd.concat(all_line_level_ocr_results_list)
1783
 
1784
  # Convert decision table and ocr results to relative coordinates
1785
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
 
2022
  tic = time.perf_counter()
2023
 
2024
  if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
2025
+ all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
2026
 
2027
  if isinstance(all_pages_decision_process_table, pd.DataFrame):
2028
  # Convert decision outputs to list of dataframes:
2029
+ all_pages_decision_process_list = [all_pages_decision_process_table]
2030
 
2031
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
2032
  out_message = "Connection to AWS Comprehend service not found."
 
2153
  page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
2154
 
2155
  if not page_decision_process_table.empty:
2156
+ all_pages_decision_process_list.append(page_decision_process_table)
2157
 
2158
  # Else, user chose not to run redaction
2159
  else:
 
2165
  if not page_text_ocr_outputs.empty:
2166
  page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
2167
  page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
2168
+ all_line_level_ocr_results_list.append(page_text_ocr_outputs)
2169
 
2170
  toc = time.perf_counter()
2171
 
 
2188
  annotations_all_pages.append(page_image_annotations)
2189
 
2190
  # Write logs
2191
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2192
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2193
 
2194
 
2195
  current_loop_page += 1
 
2213
  progress.close(_tqdm=progress_bar)
2214
 
2215
  # Write logs
2216
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2217
 
2218
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2219
 
2220
  # Write all page outputs
2221
+ all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
2222
 
2223
+ #print("all_line_level_ocr_results_list:", all_line_level_ocr_results_list)
2224
 
2225
+ all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
2226
 
2227
  #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2228
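The file_redaction.py changes above swap the old pattern of appending per-page DataFrames and calling pd.concat for a plain Python list of row dicts that is turned into a DataFrame once at the end. A self-contained toy illustration of that pattern (the page/line structure here is invented, not the app's data model):

```python
import pandas as pd

def collect_ocr_results(pages: list) -> pd.DataFrame:
    """Accumulate rows as dicts and build the DataFrame once, instead of
    concatenating a growing list of per-page DataFrames inside the loop."""
    all_rows = []
    for page in pages:
        page_rows = [
            {
                "page": page["number"],
                "text": line["text"],
                "left": line["left"],
                "top": line["top"],
                "width": line["width"],
                "height": line["height"],
            }
            for line in page["lines"]
        ]
        if page_rows:               # only extend when the page produced results
            all_rows.extend(page_rows)
    return pd.DataFrame(all_rows)   # single DataFrame construction at the end

example_pages = [
    {"number": 1, "lines": [{"text": "Hello", "left": 10, "top": 20, "width": 50, "height": 12}]},
    {"number": 2, "lines": []},     # empty pages contribute nothing
]
print(collect_ocr_results(example_pages))
```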
 
tools/find_duplicate_pages.py CHANGED
@@ -1,32 +1,20 @@
1
  import pandas as pd
2
- #import argparse
3
- #import glob
4
  import os
5
  import re
6
  from tools.helper_functions import OUTPUT_FOLDER
7
  from sklearn.feature_extraction.text import TfidfVectorizer
8
  from sklearn.metrics.pairwise import cosine_similarity
9
- # import nltk
10
- # from nltk.corpus import stopwords
11
- # from nltk.tokenize import word_tokenize
12
- # from nltk.stem import PorterStemmer
13
- #import spacy
14
- import numpy as np
15
  import random
16
  import string
17
- from typing import List
 
18
  from gradio import Progress
 
19
 
20
- import en_core_web_lg #en_core_web_sm
21
  nlp = en_core_web_lg.load()
22
- #from tqdm import tqdm
23
-
24
- # nltk.download('punkt')
25
- # nltk.download('stopwords')
26
- # nltk.download('punkt_tab')
27
-
28
- similarity_threshold = 0.9
29
 
 
30
 
31
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
32
  """
@@ -133,89 +121,317 @@ def process_data(df:pd.DataFrame, column:str):
133
 
134
  return df
135
 
136
- def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9, output_folder:str=OUTPUT_FOLDER, progress=Progress(track_tqdm=True)):
137
  output_paths = []
138
 
139
- progress(0.1, desc="Cleaning input text")
140
-
141
- # Load and clean data
142
- df, output_files = combine_ocr_output_text(input_files)
143
- output_paths.extend(output_files)
144
- df = process_data(df, 'text') # Assume this returns 'text_clean', 'file', and 'page' columns
145
 
146
- # Vectorize text
 
 
147
  vectorizer = TfidfVectorizer()
148
- tfidf_matrix = vectorizer.fit_transform(df['text_clean'])
149
 
150
  progress(0.3, desc="Calculating text similarity")
151
-
152
- # Compute sparse cosine similarity
153
- similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False) # Keep sparse format
154
-
155
- # Extract indices of similar pages above threshold
156
  coo_matrix = similarity_matrix.tocoo()
157
- similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])
158
-
159
- if similar_pages.size == 0:
160
- return pd.DataFrame(), output_paths # Return empty if no matches
161
 
162
 
 
163
 
164
- # Create a DataFrame for similar pairs
165
- similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
166
 
167
- # Remove duplicate pairs (keep one direction)
168
- similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
169
 
170
- progress(0.8, desc="Mapping back results")
171
- # Map indices to metadata
172
- # index_map = df[['file', 'page', 'text']].to_dict(orient='index')
173
- # similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['file'])
174
- # similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['file'])
175
- # similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['page'])
176
- # similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['page'])
177
- # similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(lambda x: index_map[x]['text'][0:200])
178
- # similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(lambda x: index_map[x]['text'][0:200])
 
 
 
179
 
180
- # Create a DataFrame with the metadata
181
- metadata_df = df[['file', 'page', 'text']].reset_index()
182
 
183
- # Merge to get the metadata for Page1
184
- similarity_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_on='index', suffixes=('', '_Page1'))
185
- similarity_df = similarity_df.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
 
186
 
187
- # Merge to get the metadata for Page2
188
- similarity_df = similarity_df.merge(metadata_df, left_on='Page2_Index', right_on='index', suffixes=('', '_Page2'))
189
- similarity_df = similarity_df.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
190
 
191
- # Optionally, drop the index columns if not needed
192
- #similarity_df = similarity_df.drop(columns=['index_Page1', 'index_Page2'])
 
193
 
 
194
 
195
- similarity_df["Similarity_Score"] = similarity_df["Similarity_Score"].round(3)
 
 
196
 
197
- # Sort results
198
- similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
199
- similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])
200
 
201
- similarity_df_out['Page1_Text'] = similarity_df_out['Page1_Text'][0:100]
202
- similarity_df_out['Page2_Text'] = similarity_df_out['Page2_Text'][0:100]
203
 
204
- progress(0.8, desc="Saving output files")
205
 
206
- # Save results
207
- similarity_file_output_path = output_folder + 'page_similarity_results.csv'
208
- similarity_df_out.to_csv(similarity_file_output_path, index=False)
209
- output_paths.append(similarity_file_output_path)
210
 
211
- # Save per-file redaction lists
212
- for redact_file in similarity_df_out['Page2_File'].unique():
213
- output_file_name = output_folder + redact_file + "_whole_page.csv"
214
- whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
215
- whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
216
- output_paths.append(output_file_name)
217
 
218
- return similarity_df_out, output_paths
219
 
220
  # Perturb text
221
  # Apply the perturbation function with a 10% error probability
 
1
  import pandas as pd
 
 
2
  import os
3
  import re
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
  import random
8
  import string
9
+ from typing import List, Tuple
10
+ import gradio as gr
11
  from gradio import Progress
12
+ from pathlib import Path
13
 
14
+ import en_core_web_lg
15
  nlp = en_core_web_lg.load()
16
 
17
+ similarity_threshold = 0.95
18
 
19
  def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
20
  """
 
121
 
122
  return df
123
 
124
+ def map_metadata_single_page(similarity_df, metadata_source_df):
125
+ """Helper to map metadata for single page results."""
126
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
127
+ results_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_index=True)\
128
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
129
+ results_df = results_df.merge(metadata_df, left_on='Page2_Index', right_index=True, suffixes=('_1', '_2'))\
130
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
131
+ results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
132
+ final_df = results_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
133
+ final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
134
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:200]
135
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:200]
136
+ return final_df
137
+
138
+
139
+ def map_metadata_subdocument(subdocument_df, metadata_source_df):
140
+ """Helper to map metadata for subdocument results."""
141
+ metadata_df = metadata_source_df[['file', 'page', 'text']]
142
+
143
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page1_Start_Index', right_index=True)\
144
+ .rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})
145
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page1_End_Index', right_index=True)\
146
+ .rename(columns={'page': 'Page1_End_Page'})
147
+ subdocument_df = subdocument_df.merge(metadata_df, left_on='Page2_Start_Index', right_index=True)\
148
+ .rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})
149
+ subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page2_End_Index', right_index=True)\
150
+ .rename(columns={'page': 'Page2_End_Page'})
151
+
152
+ cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
153
+ 'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
154
+ 'Match_Length', 'Page1_Text', 'Page2_Text']
155
+
156
+ # Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
157
+ if 'Avg_Similarity' in subdocument_df.columns:
158
+ subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
159
+ cols.insert(7, 'Avg_Similarity')
160
+
161
+ final_df = subdocument_df[cols]
162
+ final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
163
+ final_df['Page1_Text'] = final_df['Page1_Text'].str[:200]
164
+ final_df['Page2_Text'] = final_df['Page2_Text'].str[:200]
165
+ return final_df
166
+
167
+ def identify_similar_pages(
168
+ df_combined: pd.DataFrame,
169
+ similarity_threshold: float = 0.9,
170
+ min_word_count: int = 10,
171
+ min_consecutive_pages: int = 1,
172
+ greedy_match: bool = False, # NEW parameter
173
+ output_folder: str = OUTPUT_FOLDER,
174
+ progress=Progress(track_tqdm=True)
175
+ ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
176
+ """
177
+ Identifies similar pages with three possible strategies:
178
+ 1. Single Page: If greedy_match=False and min_consecutive_pages=1.
179
+ 2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
180
+ 3. Greedy Consecutive Match: If greedy_match=True.
181
+ """
182
+ # ... (Initial setup: progress, data loading/processing, word count filter) ...
183
+ # This part remains the same as before.
184
  output_paths = []
185
+ progress(0.1, desc="Processing and filtering text")
186
+ df = process_data(df_combined, 'text')
187
+ df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
188
+ original_row_count = len(df)
189
+ df_filtered = df[df['word_count'] >= min_word_count].copy()
190
+ df_filtered.reset_index(drop=True, inplace=True)
191
 
192
+ print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
 
 
 
 
 
193
 
194
+ if len(df_filtered) < 2:
195
+ return pd.DataFrame(), [], df_combined
196
+
197
  vectorizer = TfidfVectorizer()
198
+ tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
199
 
200
  progress(0.3, desc="Calculating text similarity")
201
+ similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
 
 
 
 
202
  coo_matrix = similarity_matrix.tocoo()
 
 
 
 
203
 
204
+ # Create a DataFrame of all individual page pairs above the threshold.
205
+ # This is the base for all three matching strategies.
206
+ similar_pages = [
207
+ (r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
208
+ if r < c and v >= similarity_threshold
209
+ ]
210
+
211
+ if not similar_pages:
212
+ return pd.DataFrame(), [], df_combined
213
 
214
+ base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
215
 
216
+ progress(0.6, desc="Aggregating results based on matching strategy")
 
217
 
218
+ # --- NEW: Logic to select matching strategy ---
 
219
 
220
+ if greedy_match:
221
+ # --- STRATEGY 3: Greedy Consecutive Matching ---
222
+ print("Finding matches using GREEDY consecutive strategy.")
223
+
224
+ # A set of pairs for fast lookups of (page1_idx, page2_idx)
225
+ valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
226
+
227
+ # Keep track of indices that have been used in a sequence
228
+ consumed_indices_1 = set()
229
+ consumed_indices_2 = set()
230
+
231
+ all_sequences = []
232
 
233
+ # Iterate through all potential starting pairs, sorted for consistent results
234
+ sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
235
 
236
+ for _, row in sorted_pairs.iterrows():
237
+ start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
238
+
239
+ # If this pair has already been consumed by a previous sequence, skip it
240
+ if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
241
+ continue
242
+
243
+ # This is a new sequence, start expanding it
244
+ current_sequence = [(start_idx1, start_idx2)]
245
+ k = 1
246
+ while True:
247
+ next_idx1 = start_idx1 + k
248
+ next_idx2 = start_idx2 + k
249
+
250
+ # Check if the next pair in the sequence is a valid match
251
+ if (next_idx1, next_idx2) in valid_pairs_set and \
252
+ next_idx1 not in consumed_indices_1 and \
253
+ next_idx2 not in consumed_indices_2:
254
+ current_sequence.append((next_idx1, next_idx2))
255
+ k += 1
256
+ else:
257
+ # The sequence has ended
258
+ break
259
+
260
+ # Record the found sequence and mark all its pages as consumed
261
+ sequence_indices_1 = [p[0] for p in current_sequence]
262
+ sequence_indices_2 = [p[1] for p in current_sequence]
263
+
264
+ all_sequences.append({
265
+ 'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
266
+ 'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
267
+ 'Match_Length': len(current_sequence)
268
+ })
269
+
270
+ consumed_indices_1.update(sequence_indices_1)
271
+ consumed_indices_2.update(sequence_indices_2)
272
+
273
+ if not all_sequences:
274
+ return pd.DataFrame(), [], df_combined
275
+
276
+ subdocument_df = pd.DataFrame(all_sequences)
277
+ # We can add back the average similarity if needed, but it requires more lookups.
278
+ # For now, we'll omit it for simplicity in the greedy approach.
279
+ # ... (The rest is metadata mapping, same as the subdocument case)
280
+
281
+ elif min_consecutive_pages > 1:
282
+ # --- STRATEGY 2: Fixed-Length Subdocument Matching ---
283
+ print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
284
+ similarity_df = base_similarity_df.copy()
285
+ similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
286
+ is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
287
+ block_id = is_consecutive.eq(False).cumsum()
288
+ grouped = similarity_df.groupby(block_id)
289
+ agg_results = grouped.agg(
290
+ Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
291
+ Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
292
+ Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
293
+ ).reset_index(drop=True)
294
+ subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
295
+ if subdocument_df.empty: return pd.DataFrame(), [], df_combined
296
 
297
+ else:
298
+ # --- STRATEGY 1: Single Page Matching ---
299
+ print(f"Finding single page matches (min_consecutive_pages=1)")
300
+ final_df = map_metadata_single_page(base_similarity_df, df_filtered)
301
+ # The rest of the logic (saving files) is handled after this if/else block
302
+ pass # The final_df is already prepared
303
+
304
+ # --- Map metadata and format output ---
305
+ # This block now handles the output for both subdocument strategies (2 and 3)
306
+ if greedy_match or min_consecutive_pages > 1:
307
+ final_df = map_metadata_subdocument(subdocument_df, df_filtered)
308
+
309
+ progress(0.8, desc="Saving output files")
310
+
311
+ # If no matches were found, final_df could be empty.
312
+ if final_df.empty:
313
+ print("No matches found, no output files to save.")
314
+ return final_df, [], df_combined
315
+
316
+ # --- 1. Save the main results DataFrame ---
317
+ # This file contains the detailed summary of all matches found.
318
+ similarity_file_output_path = Path(output_folder) / 'page_similarity_results.csv'
319
+ final_df.to_csv(similarity_file_output_path, index=False)
320
+ output_paths.append(str(similarity_file_output_path))
321
+ print(f"Main results saved to {similarity_file_output_path}")
322
+
323
+ # --- 2. Save per-file redaction lists ---
324
+ # These files contain a simple list of page numbers to redact for each document
325
+ # that contains duplicate content.
326
+
327
+ # We group by the file containing the duplicates ('Page2_File')
328
+ for redact_file, group in final_df.groupby('Page2_File'):
329
+ output_file_name_stem = Path(redact_file).stem
330
+ output_file_path = Path(output_folder) / f"{output_file_name_stem}_pages_to_redact.csv"
331
+
332
+ all_pages_to_redact = set()
333
+
334
+ # Check if the results are for single pages or subdocuments
335
+ is_subdocument_match = 'Page2_Start_Page' in group.columns
336
+
337
+ if is_subdocument_match:
338
+ # For subdocument matches, create a range of pages for each match
339
+ for _, row in group.iterrows():
340
+ # Generate all page numbers from the start to the end of the match
341
+ pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
342
+ all_pages_to_redact.update(pages_in_range)
343
+ else:
344
+ # For single-page matches, just add the page number
345
+ pages = group['Page2_Page'].unique()
346
+ all_pages_to_redact.update(pages)
347
+
348
+ if all_pages_to_redact:
349
+ # Create a DataFrame from the sorted list of pages to redact
350
+ redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
351
+ redaction_df.to_csv(output_file_path, header=False, index=False)
352
+ output_paths.append(str(output_file_path))
353
+ print(f"Redaction list for {redact_file} saved to {output_file_path}")
354
 
355
+ # Note: The 'combined ocr output' csv was part of the original data loading function,
356
+ # not the analysis function itself. If you need that, it should be saved within
357
+ # your `combine_ocr_output_text` function.
358
 
359
+ return final_df, output_paths, df_combined
360
 
361
+ # ==============================================================================
362
+ # GRADIO HELPER FUNCTIONS
363
+ # ==============================================================================
364
 
365
+ def run_analysis(files, threshold, min_words, min_consecutive, greedy_match, progress=gr.Progress(track_tqdm=True)):
366
+ """
367
+ Wrapper function updated to include the 'greedy_match' boolean.
368
+ """
369
+ if not files:
370
+ gr.Warning("Please upload files to analyze.")
371
+ return None, None, None
372
+
373
+ progress(0, desc="Combining input files...")
374
+ df_combined, _ = combine_ocr_output_text(files)
375
+
376
+ if df_combined.empty:
377
+ gr.Warning("No data found in the uploaded files.")
378
+ return None, None, None
379
+
380
+ # Call the main analysis function with the new parameter
381
+ results_df, output_paths, full_df = identify_similar_pages(
382
+ df_combined=df_combined,
383
+ similarity_threshold=threshold,
384
+ min_word_count=min_words,
385
+ min_consecutive_pages=int(min_consecutive),
386
+ greedy_match=greedy_match, # Pass the new boolean
387
+ progress=progress
388
+ )
389
+
390
+ return results_df, output_paths, full_df
391
 
392
+ def show_page_previews(full_data, results_df, evt: gr.SelectData):
393
+ """
394
+ Triggered when a user selects a row in the results DataFrame.
395
+ It uses the stored 'full_data' to find and display the complete text.
396
+ """
397
+ if full_data is None or results_df is None:
398
+ return None, None # Return empty dataframes if no analysis has been run
399
 
400
+ selected_row = results_df.iloc[evt.index[0]]
401
+
402
+ # Determine if it's a single page or a multi-page (subdocument) match
403
+ is_subdocument_match = 'Page1_Start_Page' in selected_row
404
+
405
+ if is_subdocument_match:
406
+ # --- Handle Subdocument Match ---
407
+ file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
408
+ file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']
409
+
410
+ page1_data = full_data[
411
+ (full_data['file'] == file1) &
412
+ (full_data['page'].between(start1, end1))
413
+ ].sort_values('page')[['page', 'text']]
414
+
415
+ page2_data = full_data[
416
+ (full_data['file'] == file2) &
417
+ (full_data['page'].between(start2, end2))
418
+ ].sort_values('page')[['page', 'text']]
419
+
420
+ else:
421
+ # --- Handle Single Page Match ---
422
+ file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
423
+ file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']
424
+
425
+ page1_data = full_data[
426
+ (full_data['file'] == file1) & (full_data['page'] == page1)
427
+ ][['page', 'text']]
428
 
429
+ page2_data = full_data[
430
+ (full_data['file'] == file2) & (full_data['page'] == page2)
431
+ ][['page', 'text']]
 
432
 
433
+ return page1_data, page2_data
434
 
 
435
 
436
  # Perturb text
437
  # Apply the perturbation function with a 10% error probability
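The greedy strategy added to identify_similar_pages above expands any matching page pair (i, j) to (i+1, j+1), (i+2, j+2), and so on while those pairs are also above the similarity threshold, marking consumed indices so sequences do not overlap. A standalone sketch of just that expansion step, run on an invented set of matching index pairs:

```python
def greedy_sequences(pairs):
    """pairs: set of (page1_index, page2_index) tuples that exceed the
    similarity threshold. Returns non-overlapping consecutive runs."""
    consumed_1, consumed_2, sequences = set(), set(), []
    for i, j in sorted(pairs):
        if i in consumed_1 or j in consumed_2:
            continue                       # already part of an earlier run
        k = 0
        # Extend the run while the next diagonal pair is also a valid match
        while (i + k + 1, j + k + 1) in pairs \
                and (i + k + 1) not in consumed_1 \
                and (j + k + 1) not in consumed_2:
            k += 1
        sequences.append({
            "Page1_Start_Index": i, "Page1_End_Index": i + k,
            "Page2_Start_Index": j, "Page2_End_Index": j + k,
            "Match_Length": k + 1,
        })
        consumed_1.update(range(i, i + k + 1))
        consumed_2.update(range(j, j + k + 1))
    return sequences

# One 3-page run (3-5 vs 10-12) plus a single-page match (8 vs 20)
print(greedy_sequences({(3, 10), (4, 11), (5, 12), (8, 20)}))
```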
tools/redaction_review.py CHANGED
@@ -1040,9 +1040,12 @@ def reset_dropdowns(df:pd.DataFrame):
1040
 
1041
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1042
 
 
 
 
1043
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1044
 
1045
- row_value_page = evt.row_value[0] # This is the page number value
1046
  row_value_label = evt.row_value[1] # This is the label number value
1047
  row_value_text = evt.row_value[2] # This is the text number value
1048
  row_value_id = evt.row_value[3] # This is the text number value
@@ -1072,7 +1075,7 @@ def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
1072
 
1073
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1074
 
1075
- row_value_page = evt.row_value[0] # This is the page_number value
1076
  row_value_text = evt.row_value[1] # This is the text contents
1077
 
1078
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
 
1040
 
1041
  return recogniser_entities_drop, text_entities_drop, page_entities_drop
1042
 
1043
+ def increase_bottom_page_count_based_on_top(page_number:int):
1044
+ return int(page_number)
1045
+
1046
  def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
1047
 
1048
+ row_value_page = int(evt.row_value[0]) # This is the page number value
1049
  row_value_label = evt.row_value[1] # This is the label number value
1050
  row_value_text = evt.row_value[2] # This is the text number value
1051
  row_value_id = evt.row_value[3] # This is the text number value
 
1075
 
1076
  def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
1077
 
1078
+ row_value_page = int(evt.row_value[0]) # This is the page_number value
1079
  row_value_text = evt.row_value[1] # This is the text contents
1080
 
1081
  row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
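The redaction_review.py changes cast the selected page cell to int before it is used as a page number, since values coming back from a Gradio Dataframe cell may arrive as floats or strings. A minimal, self-contained sketch of that select-callback pattern (the component names and layout here are illustrative, not the app's wiring):

```python
import gradio as gr
import pandas as pd

def on_ocr_row_select(df: pd.DataFrame, evt: gr.SelectData):
    # evt.row_value holds the full selected row; cast the page cell to int
    # before using it as a page number elsewhere in the app.
    row_value_page = int(evt.row_value[0])
    row_value_text = evt.row_value[1]
    return row_value_page, pd.DataFrame({"page": [row_value_page], "text": [row_value_text]})

with gr.Blocks() as demo:
    ocr_table = gr.Dataframe(pd.DataFrame({"page": [1, 2], "text": ["Hello", "World"]}))
    selected_page = gr.Number(label="Selected page", precision=0)
    selected_row = gr.Dataframe(visible=False)
    ocr_table.select(on_ocr_row_select, inputs=[ocr_table], outputs=[selected_page, selected_row])

# demo.launch()  # uncomment to try it locally
```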