Commit · ab04c921
Parent(s): c3d1c4c

Updated duplicate pages functionality. Improved redaction efficiency a little with the concat method. Minor modifications to documentation and interface.

Files changed:
- app.py +71 -11
- index.qmd +23 -0
- tools/aws_functions.py +1 -0
- tools/file_conversion.py +5 -7
- tools/file_redaction.py +42 -22
- tools/find_duplicate_pages.py +292 -76
- tools/redaction_review.py +5 -2
app.py
CHANGED
@@ -7,12 +7,12 @@ from tools.helper_functions import put_columns_in_df, get_connection_params, rev
|
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
10 |
-
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
|
11 |
from tools.data_anonymise import anonymise_data_files
|
12 |
from tools.auth import authenticate_user
|
13 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
14 |
from tools.custom_csvlogger import CSVLogger_custom
|
15 |
-
from tools.find_duplicate_pages import
|
16 |
from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
|
17 |
|
18 |
# Suppress downcasting warnings
|
@@ -186,6 +186,7 @@ with app:
|
|
186 |
# Duplicate page detection
|
187 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
188 |
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
|
|
|
189 |
|
190 |
# Tracking variables for current page (not visible)
|
191 |
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
|
@@ -376,7 +377,8 @@ with app:
|
|
376 |
|
377 |
with gr.Accordion("Search all extracted text", open=True):
|
378 |
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
|
379 |
-
reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
|
|
|
380 |
|
381 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
382 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
@@ -387,13 +389,47 @@ with app:
|
|
387 |
# IDENTIFY DUPLICATE PAGES TAB
|
388 |
###
|
389 |
with gr.Tab(label="Identify duplicate pages"):
|
390 |
-
with gr.Accordion("
|
391 |
-
in_duplicate_pages = gr.File(
|
|
|
|
|
392 |
with gr.Row():
|
393 |
-
|
394 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
395 |
|
396 |
-
|
397 |
|
398 |
###
|
399 |
# TEXT / TABULAR DATA TAB
|
@@ -621,7 +657,8 @@ with app:
|
|
621 |
# Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
|
622 |
recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
|
623 |
success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
|
624 |
-
success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page])
|
|
|
625 |
|
626 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
627 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
@@ -653,7 +690,10 @@ with app:
|
|
653 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
654 |
|
655 |
# Review OCR text button
|
656 |
-
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page,
|
|
|
|
|
|
|
657 |
reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
658 |
|
659 |
# Convert review file to xfdf Adobe format
|
@@ -684,7 +724,27 @@ with app:
|
|
684 |
###
|
685 |
# IDENTIFY DUPLICATE PAGES
|
686 |
###
|
687 |
-
find_duplicate_pages_btn.click(
|
|
|
|
|
688 |
|
689 |
###
|
690 |
# SETTINGS PAGE INPUT / OUTPUT
|
|
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
9 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
10 |
+
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text, increase_bottom_page_count_based_on_top
|
11 |
from tools.data_anonymise import anonymise_data_files
|
12 |
from tools.auth import authenticate_user
|
13 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
14 |
from tools.custom_csvlogger import CSVLogger_custom
|
15 |
+
from tools.find_duplicate_pages import run_analysis, show_page_previews
|
16 |
from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
|
17 |
|
18 |
# Suppress downcasting warnings
|
|
|
186 |
# Duplicate page detection
|
187 |
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
188 |
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
|
189 |
+
full_data_state = gr.State() # Full data for deduplication process
|
190 |
|
191 |
# Tracking variables for current page (not visible)
|
192 |
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
|
|
|
377 |
|
378 |
with gr.Accordion("Search all extracted text", open=True):
|
379 |
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
|
380 |
+
reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
|
381 |
+
selected_ocr_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "text":[]}), col_count=2, type="pandas", visible=False, headers=["page", "text"], wrap=True)
|
382 |
|
383 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
384 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
|
|
389 |
# IDENTIFY DUPLICATE PAGES TAB
|
390 |
###
|
391 |
with gr.Tab(label="Identify duplicate pages"):
|
392 |
+
with gr.Accordion("Step 1: Configure and Run Analysis", open = True):
|
393 |
+
in_duplicate_pages = gr.File(
|
394 |
+
label="Upload multiple 'ocr_output.csv' files to compare",
|
395 |
+
file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv']
|
396 |
+
)
|
397 |
+
|
398 |
+
gr.Markdown("#### Matching Parameters")
|
399 |
+
with gr.Row():
|
400 |
+
duplicate_threshold_input = gr.Number(value=0.95, label="Similarity Threshold", info="Score (0-1) to consider pages a match.")
|
401 |
+
min_word_count_input = gr.Number(value=10, label="Min Word Count", info="Pages with fewer words are ignored.")
|
402 |
+
|
403 |
+
gr.Markdown("#### Matching Strategy")
|
404 |
+
greedy_match_input = gr.Checkbox(
|
405 |
+
label="Enable 'Greedy' Consecutive Matching",
|
406 |
+
value=False,
|
407 |
+
info="If checked, finds the longest possible sequence of matching pages starting from any single match. Overrides the slider below."
|
408 |
+
)
|
409 |
+
min_consecutive_pages_input = gr.Slider(
|
410 |
+
minimum=1, maximum=20, value=1, step=1,
|
411 |
+
label="Minimum Consecutive Pages (for non-greedy mode)",
|
412 |
+
info="If Greedy Matching is off, use this to find sequences of a fixed minimum length."
|
413 |
+
)
|
414 |
+
|
415 |
+
find_duplicate_pages_btn = gr.Button(value="Identify Duplicate Pages", variant="primary")
|
416 |
+
|
417 |
+
with gr.Accordion("Step 2: Review Results", open=True):
|
418 |
+
gr.Markdown("### Analysis Summary\nClick on a row below to see the full page text.")
|
419 |
+
results_df_preview = gr.DataFrame(label="Similarity Results", interactive=True)
|
420 |
+
|
421 |
+
gr.Markdown("### Full Text Preview")
|
422 |
with gr.Row():
|
423 |
+
page1_text_preview = gr.DataFrame(label="Match Source (Document 1)")
|
424 |
+
page2_text_preview = gr.DataFrame(label="Match Duplicate (Document 2)")
|
425 |
+
|
426 |
+
gr.Markdown("### Downloadable Files")
|
427 |
+
duplicate_pages_out = gr.File(
|
428 |
+
label="Download analysis summary and redaction lists (.csv)",
|
429 |
+
file_count="multiple", height=FILE_INPUT_HEIGHT
|
430 |
+
)
|
431 |
|
432 |
+
# Here, it would be good to call the redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool, border:float = 5) function, where each call creates a single image annotation box. page_sizes_df could potentially be used here to supply the size inputs. A boolean flag could be added to skip the actual pymupdf page box redaction, so that Page can be passed in as a placeholder. The convert-annotation-df-to-review-df function could then concat the result onto the existing review df, updating it with the new full-page redactions.
|
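As a rough illustration of the comment above (a sketch only, not the app's code), the snippet below builds one whole-page annotation row per duplicate page and concatenates it onto an existing review DataFrame. The helper name and the page_sizes_df column names (mediabox_width/mediabox_height) are assumptions for the example; the real implementation would go through redact_whole_pymupdf_page and the annotation-to-review-df conversion instead.

import pandas as pd
from typing import List

def add_whole_page_boxes_to_review_df(review_df: pd.DataFrame,
                                      pages_to_redact: List[int],
                                      page_sizes_df: pd.DataFrame,
                                      border: float = 5.0) -> pd.DataFrame:
    """Hypothetical helper: append one 'Whole page' box per page to a review df."""
    new_rows = []
    for page in pages_to_redact:
        # Look up the page dimensions (column names are assumed for this sketch)
        size = page_sizes_df.loc[page_sizes_df["page"] == page].iloc[0]
        new_rows.append({
            "page": page,
            "label": "Whole page",
            "xmin": border,
            "ymin": border,
            "xmax": size["mediabox_width"] - border,
            "ymax": size["mediabox_height"] - border,
            "color": (0, 0, 0),
        })
    # Concat onto the existing review df, as suggested in the comment above
    return pd.concat([review_df, pd.DataFrame(new_rows)], ignore_index=True)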
433 |
|
434 |
###
|
435 |
# TEXT / TABULAR DATA TAB
|
|
|
657 |
# Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
|
658 |
recogniser_entity_dataframe.select(df_select_callback_dataframe_row, inputs=[recogniser_entity_dataframe], outputs=[selected_entity_dataframe_row, selected_entity_dataframe_row_text]).\
|
659 |
success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
|
660 |
+
success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page]).\
|
661 |
+
success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
|
662 |
|
663 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
664 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
|
|
690 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
691 |
|
692 |
# Review OCR text button
|
693 |
+
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_ocr_dataframe_row]).\
|
694 |
+
success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_state, annotate_previous_page]).\
|
695 |
+
success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
|
696 |
+
|
697 |
reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
698 |
|
699 |
# Convert review file to xfdf Adobe format
|
|
|
724 |
###
|
725 |
# IDENTIFY DUPLICATE PAGES
|
726 |
###
|
727 |
+
find_duplicate_pages_btn.click(
|
728 |
+
fn=run_analysis,
|
729 |
+
inputs=[
|
730 |
+
in_duplicate_pages,
|
731 |
+
duplicate_threshold_input,
|
732 |
+
min_word_count_input,
|
733 |
+
min_consecutive_pages_input,
|
734 |
+
greedy_match_input
|
735 |
+
],
|
736 |
+
outputs=[
|
737 |
+
results_df_preview,
|
738 |
+
duplicate_pages_out,
|
739 |
+
full_data_state
|
740 |
+
]
|
741 |
+
)
|
742 |
+
|
743 |
+
results_df_preview.select(
|
744 |
+
fn=show_page_previews,
|
745 |
+
inputs=[full_data_state, results_df_preview],
|
746 |
+
outputs=[page1_text_preview, page2_text_preview]
|
747 |
+
)
|
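For readers unfamiliar with this Gradio wiring, the following stand-alone sketch mirrors the pattern used above: a button click runs the duplicate-page analysis and stores the untruncated per-page data in a gr.State, and selecting a row in the results DataFrame looks the full text back up from that state. The stub functions here are placeholders for run_analysis and show_page_previews, invented purely for the demo.

import gradio as gr
import pandas as pd

def run_analysis_stub(files, threshold):
    # Placeholder for run_analysis: returns (summary_df, output_paths, full_per_page_df)
    summary = pd.DataFrame({"Page1_Page": [1], "Page2_Page": [5], "Similarity_Score": [0.97]})
    full = pd.DataFrame({"file": ["doc_ocr_output.csv"], "page": [1], "text": ["full page text ..."]})
    return summary, [], full

def show_page_previews_stub(full_data, results_df, evt: gr.SelectData):
    # Placeholder for show_page_previews: look up the clicked row in the stored full data
    row = results_df.iloc[evt.index[0]]
    return full_data[["page", "text"]], full_data[["page", "text"]]

with gr.Blocks() as demo:
    in_files = gr.File(file_count="multiple", file_types=[".csv"])
    threshold = gr.Number(value=0.95, label="Similarity Threshold")
    find_btn = gr.Button("Identify Duplicate Pages", variant="primary")
    results_preview = gr.DataFrame(label="Similarity Results", interactive=True)
    downloads = gr.File(file_count="multiple")
    full_data_state = gr.State()  # untruncated per-page text survives between events
    page1_preview = gr.DataFrame(label="Match Source")
    page2_preview = gr.DataFrame(label="Match Duplicate")

    find_btn.click(run_analysis_stub, inputs=[in_files, threshold],
                   outputs=[results_preview, downloads, full_data_state])
    results_preview.select(show_page_previews_stub,
                           inputs=[full_data_state, results_preview],
                           outputs=[page1_preview, page2_preview])

# demo.launch()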
748 |
|
749 |
###
|
750 |
# SETTINGS PAGE INPUT / OUTPUT
|
index.qmd
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
title: "Home"
|
3 |
+
---
|
4 |
+
|
5 |
+
version: 0.7.0
|
6 |
+
|
7 |
+
Welcome to the Document Redaction App documentation. This site provides comprehensive documentation for the Document Redaction App.
|
8 |
+
|
9 |
+
Navigate through the sections to learn how to install, use, and manage the application. Below is a brief introduction to the app.
|
10 |
+
|
11 |
+
## Document redaction
|
12 |
+
|
13 |
+
Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](src/user_guide.qmd) for a walkthrough on how to use the app.
|
14 |
+
|
15 |
+

|
16 |
+
|
17 |
+
To identify text in documents, the app provides several options. 'Local' text/OCR image analysis uses spaCy/Tesseract and works quite well for documents with typed text. If available, choose the 'AWS Textract service' to handle more complex elements such as signatures or handwriting. The app then identifies personal information for redaction. The 'Local' option, based on spaCy, is quick and gives good results if you are primarily looking for a custom list of terms to redact (see Redaction settings). If available, AWS Comprehend gives better results at a small cost.
|
18 |
+
|
19 |
+
After redaction, suggested redactions can be reviewed and modified on the 'Review redactions' tab. The original PDF can be uploaded here alongside a '...redaction_file.csv' to continue a previous redaction/review task. See the 'Redaction settings' tab to choose which pages to redact, the type of information to redact (e.g. people, places), or custom terms to always include or exclude from redaction.
|
20 |
+
|
21 |
+
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.
|
22 |
+
|
23 |
+
|
tools/aws_functions.py
CHANGED
@@ -228,5 +228,6 @@ def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=
|
|
228 |
print(final_out_message_str)
|
229 |
else:
|
230 |
final_out_message_str = "App not set to run AWS functions"
|
|
|
231 |
|
232 |
return final_out_message_str
|
|
|
228 |
print(final_out_message_str)
|
229 |
else:
|
230 |
final_out_message_str = "App not set to run AWS functions"
|
231 |
+
print(final_out_message_str)
|
232 |
|
233 |
return final_out_message_str
|
tools/file_conversion.py
CHANGED
@@ -385,24 +385,22 @@ def convert_pymupdf_to_image_coords(pymupdf_page:Page, x1:float, y1:float, x2:fl
|
|
385 |
|
386 |
return x1_image, y1_image, x2_image, y2_image
|
387 |
|
388 |
-
def redact_whole_pymupdf_page(rect_height:float, rect_width:float,
|
389 |
# Small border to page that remains white
|
390 |
border = 5
|
391 |
# Define the coordinates for the Rect
|
392 |
whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
393 |
whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
|
394 |
|
395 |
-
# whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image, image_dimensions=image_dimensions)
|
396 |
-
|
397 |
# Create new image annotation element based on whole page coordinates
|
398 |
whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
|
399 |
|
400 |
# Write whole page annotation to annotation boxes
|
401 |
whole_page_img_annotation_box = {}
|
402 |
-
whole_page_img_annotation_box["xmin"] = whole_page_x1
|
403 |
-
whole_page_img_annotation_box["ymin"] = whole_page_y1
|
404 |
-
whole_page_img_annotation_box["xmax"] = whole_page_x2
|
405 |
-
whole_page_img_annotation_box["ymax"] = whole_page_y2
|
406 |
whole_page_img_annotation_box["color"] = (0,0,0)
|
407 |
whole_page_img_annotation_box["label"] = "Whole page"
|
408 |
|
|
|
385 |
|
386 |
return x1_image, y1_image, x2_image, y2_image
|
387 |
|
388 |
+
def redact_whole_pymupdf_page(rect_height:float, rect_width:float, page:Page, custom_colours:bool, border:float = 5):
|
389 |
# Small border to page that remains white
|
390 |
border = 5
|
391 |
# Define the coordinates for the Rect
|
392 |
whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
393 |
whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
|
394 |
|
|
|
|
|
395 |
# Create new image annotation element based on whole page coordinates
|
396 |
whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
|
397 |
|
398 |
# Write whole page annotation to annotation boxes
|
399 |
whole_page_img_annotation_box = {}
|
400 |
+
whole_page_img_annotation_box["xmin"] = whole_page_x1
|
401 |
+
whole_page_img_annotation_box["ymin"] = whole_page_y1
|
402 |
+
whole_page_img_annotation_box["xmax"] = whole_page_x2
|
403 |
+
whole_page_img_annotation_box["ymax"] = whole_page_y2
|
404 |
whole_page_img_annotation_box["color"] = (0,0,0)
|
405 |
whole_page_img_annotation_box["label"] = "Whole page"
|
406 |
|
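A minimal caller sketch for the restored signature above, assuming a PyMuPDF document is already open. As the call site in tools/file_redaction.py in this commit shows, the function returns the whole-page image annotation box; the file path here is illustrative.

import fitz  # PyMuPDF
from tools.file_conversion import redact_whole_pymupdf_page

doc = fitz.open("example.pdf")          # illustrative input path
page = doc[0]
rect_width = page.mediabox.width
rect_height = page.mediabox.height

# One call builds (and, in the real function, applies) a single whole-page
# redaction box, returned as an annotation dict.
whole_page_box = redact_whole_pymupdf_page(rect_height, rect_width, page,
                                           custom_colours=False, border=5)
print(whole_page_box["label"], whole_page_box["xmin"], whole_page_box["ymax"])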
tools/file_redaction.py
CHANGED
@@ -1114,7 +1114,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1114 |
# If whole page is to be redacted, do that here
|
1115 |
if redact_whole_page == True:
|
1116 |
|
1117 |
-
whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width,
|
1118 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
1119 |
|
1120 |
out_annotation_boxes = {
|
@@ -1372,10 +1372,19 @@ def redact_image_pdf(file_path:str,
|
|
1372 |
if current_loop_page == 0: page_loop_start = 0
|
1373 |
else: page_loop_start = current_loop_page
|
1374 |
|
1375 |
-
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1376 |
|
1377 |
-
|
1378 |
-
|
|
|
|
|
|
|
1379 |
|
1380 |
# Go through each page
|
1381 |
for page_no in progress_bar:
|
@@ -1525,7 +1534,10 @@ def redact_image_pdf(file_path:str,
|
|
1525 |
'height': result.height
|
1526 |
} for result in page_line_level_ocr_results['results']])
|
1527 |
|
1528 |
-
|
|
|
|
|
|
|
1529 |
|
1530 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
1531 |
# Step 2: Analyse text and identify PII
|
@@ -1637,7 +1649,10 @@ def redact_image_pdf(file_path:str,
|
|
1637 |
'page': reported_page_number
|
1638 |
} for result in page_merged_redaction_bboxes])
|
1639 |
|
1640 |
-
|
|
|
|
|
|
|
1641 |
|
1642 |
decision_process_table = fill_missing_ids(decision_process_table)
|
1643 |
decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
|
@@ -1685,8 +1700,11 @@ def redact_image_pdf(file_path:str,
|
|
1685 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1686 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1687 |
|
1688 |
-
all_pages_decision_process_table = pd.concat(
|
1689 |
-
all_line_level_ocr_results_df = pd.concat(
|
|
|
|
|
|
|
1690 |
|
1691 |
|
1692 |
current_loop_page += 1
|
@@ -1733,9 +1751,11 @@ def redact_image_pdf(file_path:str,
|
|
1733 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1734 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1735 |
|
|
|
|
|
1736 |
|
1737 |
-
all_pages_decision_process_table = pd.
|
1738 |
-
all_line_level_ocr_results_df = pd.
|
1739 |
|
1740 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
1741 |
|
@@ -1758,8 +1778,8 @@ def redact_image_pdf(file_path:str,
|
|
1758 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1759 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1760 |
|
1761 |
-
all_pages_decision_process_table = pd.concat(
|
1762 |
-
all_line_level_ocr_results_df = pd.concat(
|
1763 |
|
1764 |
# Convert decision table and ocr results to relative coordinates
|
1765 |
all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
@@ -2002,11 +2022,11 @@ def redact_text_pdf(
|
|
2002 |
tic = time.perf_counter()
|
2003 |
|
2004 |
if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
|
2005 |
-
|
2006 |
|
2007 |
if isinstance(all_pages_decision_process_table, pd.DataFrame):
|
2008 |
# Convert decision outputs to list of dataframes:
|
2009 |
-
|
2010 |
|
2011 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
2012 |
out_message = "Connection to AWS Comprehend service not found."
|
@@ -2133,7 +2153,7 @@ def redact_text_pdf(
|
|
2133 |
page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
|
2134 |
|
2135 |
if not page_decision_process_table.empty:
|
2136 |
-
|
2137 |
|
2138 |
# Else, user chose not to run redaction
|
2139 |
else:
|
@@ -2145,7 +2165,7 @@ def redact_text_pdf(
|
|
2145 |
if not page_text_ocr_outputs.empty:
|
2146 |
page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
2147 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
2148 |
-
|
2149 |
|
2150 |
toc = time.perf_counter()
|
2151 |
|
@@ -2168,8 +2188,8 @@ def redact_text_pdf(
|
|
2168 |
annotations_all_pages.append(page_image_annotations)
|
2169 |
|
2170 |
# Write logs
|
2171 |
-
all_pages_decision_process_table = pd.concat(
|
2172 |
-
all_line_level_ocr_results_df = pd.concat(
|
2173 |
|
2174 |
|
2175 |
current_loop_page += 1
|
@@ -2193,16 +2213,16 @@ def redact_text_pdf(
|
|
2193 |
progress.close(_tqdm=progress_bar)
|
2194 |
|
2195 |
# Write logs
|
2196 |
-
all_pages_decision_process_table = pd.concat(
|
2197 |
|
2198 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2199 |
|
2200 |
# Write all page outputs
|
2201 |
-
all_pages_decision_process_table = pd.concat(
|
2202 |
|
2203 |
-
#print("
|
2204 |
|
2205 |
-
all_line_level_ocr_results_df = pd.concat(
|
2206 |
|
2207 |
#print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
|
2208 |
|
|
|
1114 |
# If whole page is to be redacted, do that here
|
1115 |
if redact_whole_page == True:
|
1116 |
|
1117 |
+
whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, page, custom_colours, border = 5)
|
1118 |
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
1119 |
|
1120 |
out_annotation_boxes = {
|
|
|
1372 |
if current_loop_page == 0: page_loop_start = 0
|
1373 |
else: page_loop_start = current_loop_page
|
1374 |
|
1375 |
+
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1376 |
|
1377 |
+
# If there's data from a previous run (passed in via the DataFrame parameters), add it
|
1378 |
+
all_line_level_ocr_results_list = []
|
1379 |
+
all_pages_decision_process_list = []
|
1380 |
+
|
1381 |
+
if not all_line_level_ocr_results_df.empty:
|
1382 |
+
all_line_level_ocr_results_list.extend(all_line_level_ocr_results_df.to_dict('records'))
|
1383 |
+
if not all_pages_decision_process_table.empty:
|
1384 |
+
all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
|
1385 |
+
|
1386 |
+
#all_line_level_ocr_results_list = [all_line_level_ocr_results_df.to_dict('records')]#[all_line_level_ocr_results_df]
|
1387 |
+
#all_pages_decision_process_list = [all_pages_decision_process_table.to_dict('records')]#[all_pages_decision_process_table]
|
1388 |
|
1389 |
# Go through each page
|
1390 |
for page_no in progress_bar:
|
|
|
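The change above (and the matching blocks further down) swaps repeated pd.concat calls inside the page loop for plain Python lists of record dicts that are turned into a DataFrame once at the end. A small, self-contained illustration of the pattern with made-up page data:

import pandas as pd

records = []  # accumulate plain dicts while looping over pages
for page_no in range(1, 4):
    page_results = [{"page": page_no, "text": f"line {i}"} for i in range(2)]
    records.extend(page_results)          # cheap list extension per page

ocr_df = pd.DataFrame(records)            # build the DataFrame once at the end

# The older pattern concatenated inside the loop, which copies all previous
# rows on every iteration and gets slower as the document grows:
# ocr_df = pd.concat([ocr_df, pd.DataFrame(page_results)])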
1534 |
'height': result.height
|
1535 |
} for result in page_line_level_ocr_results['results']])
|
1536 |
|
1537 |
+
#all_line_level_ocr_results_list.append(line_level_ocr_results_df.to_dict('records'))
|
1538 |
+
|
1539 |
+
if not line_level_ocr_results_df.empty: # Ensure there are records to add
|
1540 |
+
all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
|
1541 |
|
1542 |
if pii_identification_method != NO_REDACTION_PII_OPTION:
|
1543 |
# Step 2: Analyse text and identify PII
|
|
|
1649 |
'page': reported_page_number
|
1650 |
} for result in page_merged_redaction_bboxes])
|
1651 |
|
1652 |
+
#all_pages_decision_process_list.append(decision_process_table.to_dict('records'))
|
1653 |
+
|
1654 |
+
if not decision_process_table.empty: # Ensure there are records to add
|
1655 |
+
all_pages_decision_process_list.extend(decision_process_table.to_dict('records'))
|
1656 |
|
1657 |
decision_process_table = fill_missing_ids(decision_process_table)
|
1658 |
decision_process_table.to_csv(output_folder + "decision_process_table_with_ids.csv")
|
|
|
1700 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1701 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1702 |
|
1703 |
+
#all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
1704 |
+
#all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
1705 |
+
|
1706 |
+
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1707 |
+
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
1708 |
|
1709 |
|
1710 |
current_loop_page += 1
|
|
|
1751 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1752 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1753 |
|
1754 |
+
#all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
1755 |
+
#all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
1756 |
|
1757 |
+
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1758 |
+
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
1759 |
|
1760 |
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
1761 |
|
|
|
1778 |
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1779 |
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1780 |
|
1781 |
+
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list) #pd.concat(all_pages_decision_process_list)
|
1782 |
+
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list) #pd.concat(all_line_level_ocr_results_list)
|
1783 |
|
1784 |
# Convert decision table and ocr results to relative coordinates
|
1785 |
all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
|
|
2022 |
tic = time.perf_counter()
|
2023 |
|
2024 |
if isinstance(all_line_level_ocr_results_df, pd.DataFrame):
|
2025 |
+
all_line_level_ocr_results_list = [all_line_level_ocr_results_df]
|
2026 |
|
2027 |
if isinstance(all_pages_decision_process_table, pd.DataFrame):
|
2028 |
# Convert decision outputs to list of dataframes:
|
2029 |
+
all_pages_decision_process_list = [all_pages_decision_process_table]
|
2030 |
|
2031 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
2032 |
out_message = "Connection to AWS Comprehend service not found."
|
|
|
2153 |
page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
|
2154 |
|
2155 |
if not page_decision_process_table.empty:
|
2156 |
+
all_pages_decision_process_list.append(page_decision_process_table)
|
2157 |
|
2158 |
# Else, user chose not to run redaction
|
2159 |
else:
|
|
|
2165 |
if not page_text_ocr_outputs.empty:
|
2166 |
page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
2167 |
page_text_ocr_outputs = page_text_ocr_outputs.loc[:, ["page", "text", "left", "top", "width", "height"]]
|
2168 |
+
all_line_level_ocr_results_list.append(page_text_ocr_outputs)
|
2169 |
|
2170 |
toc = time.perf_counter()
|
2171 |
|
|
|
2188 |
annotations_all_pages.append(page_image_annotations)
|
2189 |
|
2190 |
# Write logs
|
2191 |
+
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2192 |
+
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
2193 |
|
2194 |
|
2195 |
current_loop_page += 1
|
|
|
2213 |
progress.close(_tqdm=progress_bar)
|
2214 |
|
2215 |
# Write logs
|
2216 |
+
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2217 |
|
2218 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2219 |
|
2220 |
# Write all page outputs
|
2221 |
+
all_pages_decision_process_table = pd.concat(all_pages_decision_process_list)
|
2222 |
|
2223 |
+
#print("all_line_level_ocr_results_list:", all_line_level_ocr_results_list)
|
2224 |
|
2225 |
+
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_list)
|
2226 |
|
2227 |
#print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
|
2228 |
|
tools/find_duplicate_pages.py
CHANGED
@@ -1,32 +1,20 @@
|
|
1 |
import pandas as pd
|
2 |
-
#import argparse
|
3 |
-
#import glob
|
4 |
import os
|
5 |
import re
|
6 |
from tools.helper_functions import OUTPUT_FOLDER
|
7 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
-
# import nltk
|
10 |
-
# from nltk.corpus import stopwords
|
11 |
-
# from nltk.tokenize import word_tokenize
|
12 |
-
# from nltk.stem import PorterStemmer
|
13 |
-
#import spacy
|
14 |
-
import numpy as np
|
15 |
import random
|
16 |
import string
|
17 |
-
from typing import List
|
|
|
18 |
from gradio import Progress
|
|
|
19 |
|
20 |
-
import en_core_web_lg
|
21 |
nlp = en_core_web_lg.load()
|
22 |
-
#from tqdm import tqdm
|
23 |
-
|
24 |
-
# nltk.download('punkt')
|
25 |
-
# nltk.download('stopwords')
|
26 |
-
# nltk.download('punkt_tab')
|
27 |
-
|
28 |
-
similarity_threshold = 0.9
|
29 |
|
|
|
30 |
|
31 |
def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
|
32 |
"""
|
@@ -133,89 +121,317 @@ def process_data(df:pd.DataFrame, column:str):
|
|
133 |
|
134 |
return df
|
135 |
|
136 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
137 |
output_paths = []
|
|
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
-
|
140 |
-
|
141 |
-
# Load and clean data
|
142 |
-
df, output_files = combine_ocr_output_text(input_files)
|
143 |
-
output_paths.extend(output_files)
|
144 |
-
df = process_data(df, 'text') # Assume this returns 'text_clean', 'file', and 'page' columns
|
145 |
|
146 |
-
|
|
|
|
|
147 |
vectorizer = TfidfVectorizer()
|
148 |
-
tfidf_matrix = vectorizer.fit_transform(
|
149 |
|
150 |
progress(0.3, desc="Calculating text similarity")
|
151 |
-
|
152 |
-
# Compute sparse cosine similarity
|
153 |
-
similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False) # Keep sparse format
|
154 |
-
|
155 |
-
# Extract indices of similar pages above threshold
|
156 |
coo_matrix = similarity_matrix.tocoo()
|
157 |
-
similar_pages = np.array([(i, j, v) for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data) if v > similarity_threshold])
|
158 |
-
|
159 |
-
if similar_pages.size == 0:
|
160 |
-
return pd.DataFrame(), output_paths # Return empty if no matches
|
161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
|
|
163 |
|
164 |
-
|
165 |
-
similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
|
166 |
|
167 |
-
#
|
168 |
-
similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]
|
169 |
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
|
|
179 |
|
180 |
-
|
181 |
-
|
182 |
|
183 |
-
|
184 |
-
|
185 |
-
|
|
|
|
|
|
|
|
|
|
|
|
186 |
|
187 |
-
|
188 |
-
|
189 |
-
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
-
#
|
192 |
-
#
|
|
|
193 |
|
|
|
194 |
|
195 |
-
|
|
|
|
|
196 |
|
197 |
-
|
198 |
-
|
199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
200 |
|
201 |
-
|
202 |
-
|
|
|
|
|
|
|
|
|
|
|
203 |
|
204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
205 |
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
output_paths.append(similarity_file_output_path)
|
210 |
|
211 |
-
|
212 |
-
for redact_file in similarity_df_out['Page2_File'].unique():
|
213 |
-
output_file_name = output_folder + redact_file + "_whole_page.csv"
|
214 |
-
whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']].drop_duplicates(['Page2_Page']).sort_values('Page2_Page')
|
215 |
-
whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)
|
216 |
-
output_paths.append(output_file_name)
|
217 |
|
218 |
-
return similarity_df_out, output_paths
|
219 |
|
220 |
# Perturb text
|
221 |
# Apply the perturbation function with a 10% error probability
|
|
|
1 |
import pandas as pd
|
|
|
|
|
2 |
import os
|
3 |
import re
|
4 |
from tools.helper_functions import OUTPUT_FOLDER
|
5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
import random
|
8 |
import string
|
9 |
+
from typing import List, Tuple
|
10 |
+
import gradio as gr
|
11 |
from gradio import Progress
|
12 |
+
from pathlib import Path
|
13 |
|
14 |
+
import en_core_web_lg
|
15 |
nlp = en_core_web_lg.load()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
|
17 |
+
similarity_threshold = 0.95
|
18 |
|
19 |
def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
|
20 |
"""
|
|
|
121 |
|
122 |
return df
|
123 |
|
124 |
+
def map_metadata_single_page(similarity_df, metadata_source_df):
|
125 |
+
"""Helper to map metadata for single page results."""
|
126 |
+
metadata_df = metadata_source_df[['file', 'page', 'text']]
|
127 |
+
results_df = similarity_df.merge(metadata_df, left_on='Page1_Index', right_index=True)\
|
128 |
+
.rename(columns={'file': 'Page1_File', 'page': 'Page1_Page', 'text': 'Page1_Text'})
|
129 |
+
results_df = results_df.merge(metadata_df, left_on='Page2_Index', right_index=True, suffixes=('_1', '_2'))\
|
130 |
+
.rename(columns={'file': 'Page2_File', 'page': 'Page2_Page', 'text': 'Page2_Text'})
|
131 |
+
results_df["Similarity_Score"] = results_df["Similarity_Score"].round(3)
|
132 |
+
final_df = results_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
|
133 |
+
final_df = final_df.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page"])
|
134 |
+
final_df['Page1_Text'] = final_df['Page1_Text'].str[:200]
|
135 |
+
final_df['Page2_Text'] = final_df['Page2_Text'].str[:200]
|
136 |
+
return final_df
|
137 |
+
|
138 |
+
|
139 |
+
def map_metadata_subdocument(subdocument_df, metadata_source_df):
|
140 |
+
"""Helper to map metadata for subdocument results."""
|
141 |
+
metadata_df = metadata_source_df[['file', 'page', 'text']]
|
142 |
+
|
143 |
+
subdocument_df = subdocument_df.merge(metadata_df, left_on='Page1_Start_Index', right_index=True)\
|
144 |
+
.rename(columns={'file': 'Page1_File', 'page': 'Page1_Start_Page', 'text': 'Page1_Text'})
|
145 |
+
subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page1_End_Index', right_index=True)\
|
146 |
+
.rename(columns={'page': 'Page1_End_Page'})
|
147 |
+
subdocument_df = subdocument_df.merge(metadata_df, left_on='Page2_Start_Index', right_index=True)\
|
148 |
+
.rename(columns={'file': 'Page2_File', 'page': 'Page2_Start_Page', 'text': 'Page2_Text'})
|
149 |
+
subdocument_df = subdocument_df.merge(metadata_df[['page']], left_on='Page2_End_Index', right_index=True)\
|
150 |
+
.rename(columns={'page': 'Page2_End_Page'})
|
151 |
+
|
152 |
+
cols = ['Page1_File', 'Page1_Start_Page', 'Page1_End_Page',
|
153 |
+
'Page2_File', 'Page2_Start_Page', 'Page2_End_Page',
|
154 |
+
'Match_Length', 'Page1_Text', 'Page2_Text']
|
155 |
+
|
156 |
+
# Add Avg_Similarity if it exists (it won't for greedy match unless we add it)
|
157 |
+
if 'Avg_Similarity' in subdocument_df.columns:
|
158 |
+
subdocument_df['Avg_Similarity'] = subdocument_df['Avg_Similarity'].round(3)
|
159 |
+
cols.insert(7, 'Avg_Similarity')
|
160 |
+
|
161 |
+
final_df = subdocument_df[cols]
|
162 |
+
final_df = final_df.sort_values(['Page1_File', 'Page1_Start_Page', 'Page2_File', 'Page2_Start_Page'])
|
163 |
+
final_df['Page1_Text'] = final_df['Page1_Text'].str[:200]
|
164 |
+
final_df['Page2_Text'] = final_df['Page2_Text'].str[:200]
|
165 |
+
return final_df
|
166 |
+
|
167 |
+
def identify_similar_pages(
|
168 |
+
df_combined: pd.DataFrame,
|
169 |
+
similarity_threshold: float = 0.9,
|
170 |
+
min_word_count: int = 10,
|
171 |
+
min_consecutive_pages: int = 1,
|
172 |
+
greedy_match: bool = False, # NEW parameter
|
173 |
+
output_folder: str = OUTPUT_FOLDER,
|
174 |
+
progress=Progress(track_tqdm=True)
|
175 |
+
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
|
176 |
+
"""
|
177 |
+
Identifies similar pages with three possible strategies:
|
178 |
+
1. Single Page: If greedy_match=False and min_consecutive_pages=1.
|
179 |
+
2. Fixed-Length Subdocument: If greedy_match=False and min_consecutive_pages > 1.
|
180 |
+
3. Greedy Consecutive Match: If greedy_match=True.
|
181 |
+
"""
|
182 |
+
# ... (Initial setup: progress, data loading/processing, word count filter) ...
|
183 |
+
# This part remains the same as before.
|
184 |
output_paths = []
|
185 |
+
progress(0.1, desc="Processing and filtering text")
|
186 |
+
df = process_data(df_combined, 'text')
|
187 |
+
df['word_count'] = df['text_clean'].str.split().str.len().fillna(0)
|
188 |
+
original_row_count = len(df)
|
189 |
+
df_filtered = df[df['word_count'] >= min_word_count].copy()
|
190 |
+
df_filtered.reset_index(drop=True, inplace=True)
|
191 |
|
192 |
+
print(f"Filtered out {original_row_count - len(df_filtered)} pages with fewer than {min_word_count} words.")
|
|
|
|
|
|
|
|
|
|
|
193 |
|
194 |
+
if len(df_filtered) < 2:
|
195 |
+
return pd.DataFrame(), [], df_combined
|
196 |
+
|
197 |
vectorizer = TfidfVectorizer()
|
198 |
+
tfidf_matrix = vectorizer.fit_transform(df_filtered['text_clean'])
|
199 |
|
200 |
progress(0.3, desc="Calculating text similarity")
|
201 |
+
similarity_matrix = cosine_similarity(tfidf_matrix, dense_output=False)
|
|
|
|
|
|
|
|
|
202 |
coo_matrix = similarity_matrix.tocoo()
|
|
|
|
|
|
|
|
|
203 |
|
204 |
+
# Create a DataFrame of all individual page pairs above the threshold.
|
205 |
+
# This is the base for all three matching strategies.
|
206 |
+
similar_pages = [
|
207 |
+
(r, c, v) for r, c, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data)
|
208 |
+
if r < c and v >= similarity_threshold
|
209 |
+
]
|
210 |
+
|
211 |
+
if not similar_pages:
|
212 |
+
return pd.DataFrame(), [], df_combined
|
213 |
|
214 |
+
base_similarity_df = pd.DataFrame(similar_pages, columns=['Page1_Index', 'Page2_Index', 'Similarity_Score'])
|
215 |
|
216 |
+
progress(0.6, desc="Aggregating results based on matching strategy")
|
|
|
217 |
|
218 |
+
# --- NEW: Logic to select matching strategy ---
|
|
|
219 |
|
220 |
+
if greedy_match:
|
221 |
+
# --- STRATEGY 3: Greedy Consecutive Matching ---
|
222 |
+
print("Finding matches using GREEDY consecutive strategy.")
|
223 |
+
|
224 |
+
# A set of pairs for fast lookups of (page1_idx, page2_idx)
|
225 |
+
valid_pairs_set = set(zip(base_similarity_df['Page1_Index'], base_similarity_df['Page2_Index']))
|
226 |
+
|
227 |
+
# Keep track of indices that have been used in a sequence
|
228 |
+
consumed_indices_1 = set()
|
229 |
+
consumed_indices_2 = set()
|
230 |
+
|
231 |
+
all_sequences = []
|
232 |
|
233 |
+
# Iterate through all potential starting pairs, sorted for consistent results
|
234 |
+
sorted_pairs = base_similarity_df.sort_values(['Page1_Index', 'Page2_Index'])
|
235 |
|
236 |
+
for _, row in sorted_pairs.iterrows():
|
237 |
+
start_idx1, start_idx2 = int(row['Page1_Index']), int(row['Page2_Index'])
|
238 |
+
|
239 |
+
# If this pair has already been consumed by a previous sequence, skip it
|
240 |
+
if start_idx1 in consumed_indices_1 or start_idx2 in consumed_indices_2:
|
241 |
+
continue
|
242 |
+
|
243 |
+
# This is a new sequence, start expanding it
|
244 |
+
current_sequence = [(start_idx1, start_idx2)]
|
245 |
+
k = 1
|
246 |
+
while True:
|
247 |
+
next_idx1 = start_idx1 + k
|
248 |
+
next_idx2 = start_idx2 + k
|
249 |
+
|
250 |
+
# Check if the next pair in the sequence is a valid match
|
251 |
+
if (next_idx1, next_idx2) in valid_pairs_set and \
|
252 |
+
next_idx1 not in consumed_indices_1 and \
|
253 |
+
next_idx2 not in consumed_indices_2:
|
254 |
+
current_sequence.append((next_idx1, next_idx2))
|
255 |
+
k += 1
|
256 |
+
else:
|
257 |
+
# The sequence has ended
|
258 |
+
break
|
259 |
+
|
260 |
+
# Record the found sequence and mark all its pages as consumed
|
261 |
+
sequence_indices_1 = [p[0] for p in current_sequence]
|
262 |
+
sequence_indices_2 = [p[1] for p in current_sequence]
|
263 |
+
|
264 |
+
all_sequences.append({
|
265 |
+
'Page1_Start_Index': sequence_indices_1[0], 'Page1_End_Index': sequence_indices_1[-1],
|
266 |
+
'Page2_Start_Index': sequence_indices_2[0], 'Page2_End_Index': sequence_indices_2[-1],
|
267 |
+
'Match_Length': len(current_sequence)
|
268 |
+
})
|
269 |
+
|
270 |
+
consumed_indices_1.update(sequence_indices_1)
|
271 |
+
consumed_indices_2.update(sequence_indices_2)
|
272 |
+
|
273 |
+
if not all_sequences:
|
274 |
+
return pd.DataFrame(), [], df_combined
|
275 |
+
|
276 |
+
subdocument_df = pd.DataFrame(all_sequences)
|
277 |
+
# We can add back the average similarity if needed, but it requires more lookups.
|
278 |
+
# For now, we'll omit it for simplicity in the greedy approach.
|
279 |
+
# ... (The rest is metadata mapping, same as the subdocument case)
|
280 |
+
|
281 |
+
elif min_consecutive_pages > 1:
|
282 |
+
# --- STRATEGY 2: Fixed-Length Subdocument Matching ---
|
283 |
+
print(f"Finding consecutive page matches (min_consecutive_pages > 1)")
|
284 |
+
similarity_df = base_similarity_df.copy()
|
285 |
+
similarity_df.sort_values(['Page1_Index', 'Page2_Index'], inplace=True)
|
286 |
+
is_consecutive = (similarity_df['Page1_Index'].diff() == 1) & (similarity_df['Page2_Index'].diff() == 1)
|
287 |
+
block_id = is_consecutive.eq(False).cumsum()
|
288 |
+
grouped = similarity_df.groupby(block_id)
|
289 |
+
agg_results = grouped.agg(
|
290 |
+
Page1_Start_Index=('Page1_Index', 'first'), Page2_Start_Index=('Page2_Index', 'first'),
|
291 |
+
Page1_End_Index=('Page1_Index', 'last'), Page2_End_Index=('Page2_Index', 'last'),
|
292 |
+
Match_Length=('Page1_Index', 'size'), Avg_Similarity=('Similarity_Score', 'mean')
|
293 |
+
).reset_index(drop=True)
|
294 |
+
subdocument_df = agg_results[agg_results['Match_Length'] >= min_consecutive_pages].copy()
|
295 |
+
if subdocument_df.empty: return pd.DataFrame(), [], df_combined
|
296 |
|
297 |
+
else:
|
298 |
+
# --- STRATEGY 1: Single Page Matching ---
|
299 |
+
print(f"Finding single page matches (min_consecutive_pages=1)")
|
300 |
+
final_df = map_metadata_single_page(base_similarity_df, df_filtered)
|
301 |
+
# The rest of the logic (saving files) is handled after this if/else block
|
302 |
+
pass # The final_df is already prepared
|
303 |
+
|
304 |
+
# --- Map metadata and format output ---
|
305 |
+
# This block now handles the output for both subdocument strategies (2 and 3)
|
306 |
+
if greedy_match or min_consecutive_pages > 1:
|
307 |
+
final_df = map_metadata_subdocument(subdocument_df, df_filtered)
|
308 |
+
|
309 |
+
progress(0.8, desc="Saving output files")
|
310 |
+
|
311 |
+
# If no matches were found, final_df could be empty.
|
312 |
+
if final_df.empty:
|
313 |
+
print("No matches found, no output files to save.")
|
314 |
+
return final_df, [], df_combined
|
315 |
+
|
316 |
+
# --- 1. Save the main results DataFrame ---
|
317 |
+
# This file contains the detailed summary of all matches found.
|
318 |
+
similarity_file_output_path = Path(output_folder) / 'page_similarity_results.csv'
|
319 |
+
final_df.to_csv(similarity_file_output_path, index=False)
|
320 |
+
output_paths.append(str(similarity_file_output_path))
|
321 |
+
print(f"Main results saved to {similarity_file_output_path}")
|
322 |
+
|
323 |
+
# --- 2. Save per-file redaction lists ---
|
324 |
+
# These files contain a simple list of page numbers to redact for each document
|
325 |
+
# that contains duplicate content.
|
326 |
+
|
327 |
+
# We group by the file containing the duplicates ('Page2_File')
|
328 |
+
for redact_file, group in final_df.groupby('Page2_File'):
|
329 |
+
output_file_name_stem = Path(redact_file).stem
|
330 |
+
output_file_path = Path(output_folder) / f"{output_file_name_stem}_pages_to_redact.csv"
|
331 |
+
|
332 |
+
all_pages_to_redact = set()
|
333 |
+
|
334 |
+
# Check if the results are for single pages or subdocuments
|
335 |
+
is_subdocument_match = 'Page2_Start_Page' in group.columns
|
336 |
+
|
337 |
+
if is_subdocument_match:
|
338 |
+
# For subdocument matches, create a range of pages for each match
|
339 |
+
for _, row in group.iterrows():
|
340 |
+
# Generate all page numbers from the start to the end of the match
|
341 |
+
pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
|
342 |
+
all_pages_to_redact.update(pages_in_range)
|
343 |
+
else:
|
344 |
+
# For single-page matches, just add the page number
|
345 |
+
pages = group['Page2_Page'].unique()
|
346 |
+
all_pages_to_redact.update(pages)
|
347 |
+
|
348 |
+
if all_pages_to_redact:
|
349 |
+
# Create a DataFrame from the sorted list of pages to redact
|
350 |
+
redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
|
351 |
+
redaction_df.to_csv(output_file_path, header=False, index=False)
|
352 |
+
output_paths.append(str(output_file_path))
|
353 |
+
print(f"Redaction list for {redact_file} saved to {output_file_path}")
|
354 |
|
355 |
+
# Note: The 'combined ocr output' csv was part of the original data loading function,
|
356 |
+
# not the analysis function itself. If you need that, it should be saved within
|
357 |
+
# your `combine_ocr_output_text` function.
|
358 |
|
359 |
+
return final_df, output_paths, df_combined
|
360 |
|
361 |
+
# ==============================================================================
|
362 |
+
# GRADIO HELPER FUNCTIONS
|
363 |
+
# ==============================================================================
|
364 |
|
365 |
+
def run_analysis(files, threshold, min_words, min_consecutive, greedy_match, progress=gr.Progress(track_tqdm=True)):
|
366 |
+
"""
|
367 |
+
Wrapper function updated to include the 'greedy_match' boolean.
|
368 |
+
"""
|
369 |
+
if not files:
|
370 |
+
gr.Warning("Please upload files to analyze.")
|
371 |
+
return None, None, None
|
372 |
+
|
373 |
+
progress(0, desc="Combining input files...")
|
374 |
+
df_combined, _ = combine_ocr_output_text(files)
|
375 |
+
|
376 |
+
if df_combined.empty:
|
377 |
+
gr.Warning("No data found in the uploaded files.")
|
378 |
+
return None, None, None
|
379 |
+
|
380 |
+
# Call the main analysis function with the new parameter
|
381 |
+
results_df, output_paths, full_df = identify_similar_pages(
|
382 |
+
df_combined=df_combined,
|
383 |
+
similarity_threshold=threshold,
|
384 |
+
min_word_count=min_words,
|
385 |
+
min_consecutive_pages=int(min_consecutive),
|
386 |
+
greedy_match=greedy_match, # Pass the new boolean
|
387 |
+
progress=progress
|
388 |
+
)
|
389 |
+
|
390 |
+
return results_df, output_paths, full_df
|
391 |
|
392 |
+
def show_page_previews(full_data, results_df, evt: gr.SelectData):
|
393 |
+
"""
|
394 |
+
Triggered when a user selects a row in the results DataFrame.
|
395 |
+
It uses the stored 'full_data' to find and display the complete text.
|
396 |
+
"""
|
397 |
+
if full_data is None or results_df is None:
|
398 |
+
return None, None # Return empty dataframes if no analysis has been run
|
399 |
|
400 |
+
selected_row = results_df.iloc[evt.index[0]]
|
401 |
+
|
402 |
+
# Determine if it's a single page or a multi-page (subdocument) match
|
403 |
+
is_subdocument_match = 'Page1_Start_Page' in selected_row
|
404 |
+
|
405 |
+
if is_subdocument_match:
|
406 |
+
# --- Handle Subdocument Match ---
|
407 |
+
file1, start1, end1 = selected_row['Page1_File'], selected_row['Page1_Start_Page'], selected_row['Page1_End_Page']
|
408 |
+
file2, start2, end2 = selected_row['Page2_File'], selected_row['Page2_Start_Page'], selected_row['Page2_End_Page']
|
409 |
+
|
410 |
+
page1_data = full_data[
|
411 |
+
(full_data['file'] == file1) &
|
412 |
+
(full_data['page'].between(start1, end1))
|
413 |
+
].sort_values('page')[['page', 'text']]
|
414 |
+
|
415 |
+
page2_data = full_data[
|
416 |
+
(full_data['file'] == file2) &
|
417 |
+
(full_data['page'].between(start2, end2))
|
418 |
+
].sort_values('page')[['page', 'text']]
|
419 |
+
|
420 |
+
else:
|
421 |
+
# --- Handle Single Page Match ---
|
422 |
+
file1, page1 = selected_row['Page1_File'], selected_row['Page1_Page']
|
423 |
+
file2, page2 = selected_row['Page2_File'], selected_row['Page2_Page']
|
424 |
+
|
425 |
+
page1_data = full_data[
|
426 |
+
(full_data['file'] == file1) & (full_data['page'] == page1)
|
427 |
+
][['page', 'text']]
|
428 |
|
429 |
+
page2_data = full_data[
|
430 |
+
(full_data['file'] == file2) & (full_data['page'] == page2)
|
431 |
+
][['page', 'text']]
|
|
|
432 |
|
433 |
+
return page1_data, page2_data
|
|
|
|
|
|
|
|
|
|
|
434 |
|
|
|
435 |
|
436 |
# Perturb text
|
437 |
# Apply the perturbation function with a 10% error probability
|
tools/redaction_review.py
CHANGED
@@ -1040,9 +1040,12 @@ def reset_dropdowns(df:pd.DataFrame):
|
|
1040 |
|
1041 |
return recogniser_entities_drop, text_entities_drop, page_entities_drop
|
1042 |
|
|
|
|
|
|
|
1043 |
def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
|
1044 |
|
1045 |
-
row_value_page = evt.row_value[0] # This is the page number value
|
1046 |
row_value_label = evt.row_value[1] # This is the label number value
|
1047 |
row_value_text = evt.row_value[2] # This is the text number value
|
1048 |
row_value_id = evt.row_value[3] # This is the text number value
|
@@ -1072,7 +1075,7 @@ def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
|
|
1072 |
|
1073 |
def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
|
1074 |
|
1075 |
-
row_value_page = evt.row_value[0] # This is the page_number value
|
1076 |
row_value_text = evt.row_value[1] # This is the text contents
|
1077 |
|
1078 |
row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
|
|
|
1040 |
|
1041 |
return recogniser_entities_drop, text_entities_drop, page_entities_drop
|
1042 |
|
1043 |
+
def increase_bottom_page_count_based_on_top(page_number:int):
|
1044 |
+
return int(page_number)
|
1045 |
+
|
1046 |
def df_select_callback_dataframe_row(df: pd.DataFrame, evt: gr.SelectData):
|
1047 |
|
1048 |
+
row_value_page = int(evt.row_value[0]) # This is the page number value
|
1049 |
row_value_label = evt.row_value[1] # This is the label number value
|
1050 |
row_value_text = evt.row_value[2] # This is the text number value
|
1051 |
row_value_id = evt.row_value[3] # This is the text number value
|
|
|
1075 |
|
1076 |
def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
|
1077 |
|
1078 |
+
row_value_page = int(evt.row_value[0]) # This is the page_number value
|
1079 |
row_value_text = evt.row_value[1] # This is the text contents
|
1080 |
|
1081 |
row_value_df = pd.DataFrame(data={"page":[row_value_page], "text":[row_value_text]})
|
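The new increase_bottom_page_count_based_on_top helper simply mirrors the top page number into the bottom page selector, and the added int(...) casts guard against the selected row values coming back as floats or strings. A minimal sketch of how such a sync step can be wired in Gradio (component names here are stand-ins for annotate_current_page and annotate_current_page_bottom):

import gradio as gr

def increase_bottom_page_count_based_on_top(page_number: int):
    # Keep the bottom page selector in sync with the top one
    return int(page_number)

with gr.Blocks() as demo:
    annotate_current_page = gr.Number(value=1, precision=0, label="Page (top)")
    annotate_current_page_bottom = gr.Number(value=1, precision=0, label="Page (bottom)")

    # Whenever the top page number changes, copy it to the bottom control,
    # mirroring the .success(...) step appended to the event chains above.
    annotate_current_page.change(increase_bottom_page_count_based_on_top,
                                 inputs=[annotate_current_page],
                                 outputs=[annotate_current_page_bottom])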