Commit c8ffcd4 · 1 parent: e424038
Further updates to line level duplicate identification
Files changed:
- app.py +38 -21
- tools/config.py +11 -34
- tools/file_conversion.py +71 -2
- tools/file_redaction.py +1 -21
- tools/find_duplicate_pages.py +409 -100
- tools/helper_functions.py +10 -0
- tools/redaction_review.py +0 -1
app.py CHANGED

@@ -2,8 +2,8 @@ import os
 import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME
-from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
+from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
 from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names

@@ -12,12 +12,22 @@ from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
-from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list
+from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates
 from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
 
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
 
+# Ensure that output folders exist
+ensure_folder_exists(CONFIG_FOLDER)
+ensure_folder_exists(OUTPUT_FOLDER)
+ensure_folder_exists(INPUT_FOLDER)
+ensure_folder_exists(GRADIO_TEMP_DIR)
+ensure_folder_exists(MPLCONFIGDIR)
+ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
+ensure_folder_exists(ACCESS_LOGS_FOLDER)
+ensure_folder_exists(USAGE_LOGS_FOLDER)
+
 # Convert string environment variables to string or list
 if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
 else: SAVE_LOGS_TO_CSV = False

@@ -89,15 +99,15 @@ with app:
 # Backup versions of these objects in case you make a mistake
 backup_review_state = gr.Dataframe(visible=False)
 backup_image_annotations_state = gr.State([])
-backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
+backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
 
 # Logging variables
 access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
-access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=
+access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=S3_ACCESS_LOGS_FOLDER, visible=False)
 feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
-feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=
+feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=S3_FEEDBACK_LOGS_FOLDER, visible=False)
 usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
-usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=
+usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=S3_USAGE_LOGS_FOLDER, visible=False)
 
 session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
 textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)

@@ -164,7 +174,7 @@ with app:
 
 load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
 s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
-local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
+local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
 
 s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
 default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)

@@ -173,9 +183,8 @@ with app:
 
 # Base tables that are not modified subsequent to load
 recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
-all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page",
-all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
-cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
+all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page","text", "left","top","width","height"], row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
+all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
 
 # Placeholder for selected entity dataframe row
 selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)

@@ -197,6 +206,7 @@ with app:
 page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
 
 # Placeholders for elements that may be made visible later below depending on environment variables
+cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
 cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
 cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
 
@@ -226,6 +236,9 @@ with app:
 textract_job_output_file = gr.File(label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False)
 convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
 
+## Duplicate search object
+new_duplicate_search_annotation_object = gr.Dropdown(value=None, label="new_duplicate_search_annotation_object", allow_custom_value=True, visible=False)
+
 ###
 # UI DESIGN
 ###

@@ -408,7 +421,7 @@ with app:
 with gr.Row():
 duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
 min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
-
+combine_page_text_for_duplicates_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
 
 gr.Markdown("#### Matching Strategy")
 greedy_match_input = gr.Checkbox(

@@ -588,7 +601,7 @@ with app:
 cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
 
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
+success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
 success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
 success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
 
@@ -627,7 +640,7 @@ with app:
 textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
 
 convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
+success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
 success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
 success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
 success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\

@@ -644,7 +657,7 @@ with app:
 # Upload previous files for modifying redactions
 upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
 success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
+success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
 # Manual updates to review di

@@ -725,6 +738,7 @@ with app:
 success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
 success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
 
+# Undo last redaction exclusion action
 undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
 success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])

@@ -734,16 +748,17 @@ with app:
 success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
 success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
 
+# Reset the OCR results filter
 reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
 
 # Convert review file to xfdf Adobe format
 convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
+success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
 success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
 
 # Convert xfdf Adobe file back to review_file.csv
 convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
-success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
+success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
 success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
 
 ###

@@ -764,6 +779,8 @@ with app:
 ###
 # IDENTIFY DUPLICATE PAGES
 ###
+#in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox])
+
 find_duplicate_pages_btn.click(
 fn=run_duplicate_analysis,
 inputs=[

@@ -772,7 +789,7 @@ with app:
 min_word_count_input,
 min_consecutive_pages_input,
 greedy_match_input,
-
+combine_page_text_for_duplicates_bool
 ],
 outputs=[
 results_df_preview,

@@ -795,9 +812,9 @@ with app:
 outputs=[results_df_preview, duplicate_files_out, page1_text_preview, page2_text_preview]
 )
 
-apply_match_btn.click(
-fn=apply_whole_page_redactions_from_list,
-inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
+apply_match_btn.click(fn=create_annotation_objects_from_duplicates, inputs=[results_df_preview, all_line_level_ocr_results_df_base, page_sizes, combine_page_text_for_duplicates_bool], outputs=[new_duplicate_search_annotation_object]).\
+success(fn=apply_whole_page_redactions_from_list,
+inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state, combine_page_text_for_duplicates_bool, new_duplicate_search_annotation_object],
 outputs=[review_file_df, all_image_annotations_state]).\
 success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
 success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
tools/config.py CHANGED

@@ -28,16 +28,6 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
 
     return value
 
-def ensure_folder_exists(output_folder:str):
-    """Checks if the specified folder exists, creates it if not."""
-
-    if not os.path.exists(output_folder):
-        # Create the folder if it doesn't exist
-        os.makedirs(output_folder, exist_ok=True)
-        print(f"Created the {output_folder} folder.")
-    else:
-        print(f"The {output_folder} folder already exists.")
-
 def add_folder_to_path(folder_path: str):
     '''
     Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)

@@ -59,15 +49,12 @@ def add_folder_to_path(folder_path: str):
     else:
         print(f"Folder not found at {folder_path} - not added to PATH")
 
-
 ###
 # LOAD CONFIG FROM ENV FILE
 ###
 
 CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
 
-ensure_folder_exists(CONFIG_FOLDER)
-
 # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
 APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
 
@@ -115,9 +102,6 @@ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
 # Retrieving or setting CUSTOM_HEADER_VALUE
 CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
 
-
-
-
 ###
 # Image options
 ###

@@ -134,9 +118,6 @@ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
 OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
 INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
 
-ensure_folder_exists(OUTPUT_FOLDER)
-ensure_folder_exists(INPUT_FOLDER)
-
 # Allow for files to be saved in a temporary folder for increased security in some instances
 if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
     # Create a temporary directory

@@ -146,13 +127,9 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
     if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
     if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
 
-
 GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
 MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
 
-ensure_folder_exists(GRADIO_TEMP_DIR)
-ensure_folder_exists(MPLCONFIGDIR)
-
 ###
 # LOGGING OPTIONS
 ###

@@ -164,32 +141,33 @@ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
 
 USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
 
+FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/')
+ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/')
+USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/')
+
 if USE_LOG_SUBFOLDERS == "True":
     day_log_subfolder = today_rev + '/'
     host_name_subfolder = HOST_NAME + '/'
     full_log_subfolder = day_log_subfolder + host_name_subfolder
-else:
-    full_log_subfolder = ""
 
-FEEDBACK_LOGS_FOLDER =
-ACCESS_LOGS_FOLDER =
-USAGE_LOGS_FOLDER =
+    FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
+    ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
+    USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
 
-
-
-
+S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', FEEDBACK_LOGS_FOLDER)
+S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', ACCESS_LOGS_FOLDER)
+S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', USAGE_LOGS_FOLDER)
 
 # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
 DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
 
 # Further customisation options for CSV logs
-
 CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
 CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
 CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
 
 ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
-
 SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
 
 ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')

@@ -213,7 +191,6 @@ USAGE_LOG_FILE_NAME = get_or_create_env_var('USAGE_LOG_FILE_NAME', LOG_FILE_NAME
 FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
 
 
-
 ###
 # REDACTION OPTIONS
 ###
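The ensure_folder_exists helper removed above is not deleted outright: app.py now imports it from tools.helper_functions, and the file summary shows tools/helper_functions.py gaining ten lines, so presumably the function moved there unchanged. The reworked log-folder block also changes behaviour: base folders are now env-var configurable, and the date/host subfolder is appended only when USE_LOG_SUBFOLDERS is "True". A worked sketch of the resulting paths, with hypothetical date and host values:

# Worked sketch assuming hypothetical values today_rev = "20250101" and
# HOST_NAME = "host1", USE_LOG_SUBFOLDERS left at its default "True", and no
# overriding environment variables set:
#
#   FEEDBACK_LOGS_FOLDER = "feedback/" + "20250101/" + "host1/"  # "feedback/20250101/host1/"
#   ACCESS_LOGS_FOLDER   = "logs/"     + "20250101/" + "host1/"  # "logs/20250101/host1/"
#   USAGE_LOGS_FOLDER    = "usage/"    + "20250101/" + "host1/"  # "usage/20250101/host1/"
#
# The new S3_FEEDBACK_LOGS_FOLDER / S3_ACCESS_LOGS_FOLDER / S3_USAGE_LOGS_FOLDER
# variables default to these same paths unless set via their own environment variables.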
tools/file_conversion.py
CHANGED
@@ -455,6 +455,7 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
|
|
455 |
def prepare_image_or_pdf(
|
456 |
file_paths: List[str],
|
457 |
in_redact_method: str,
|
|
|
458 |
latest_file_completed: int = 0,
|
459 |
out_message: List[str] = [],
|
460 |
first_loop_state: bool = False,
|
@@ -506,7 +507,6 @@ def prepare_image_or_pdf(
|
|
506 |
pymupdf_doc = []
|
507 |
all_img_details = []
|
508 |
review_file_csv = pd.DataFrame()
|
509 |
-
@@ -455,6 +455,7 @@
 def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
+    all_line_level_ocr_results_df:pd.DataFrame,
     latest_file_completed: int = 0,
     out_message: List[str] = [],
     first_loop_state: bool = False,
@@ -506,7 +507,6 @@ def prepare_image_or_pdf(
     pymupdf_doc = []
     all_img_details = []
     review_file_csv = pd.DataFrame()
-    all_line_level_ocr_results_df = pd.DataFrame()
     out_textract_path = ""
     combined_out_message = ""
     final_out_message = ""
@@ -1289,6 +1289,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
     '''
     if not all_annotations:
         # Return an empty DataFrame with the expected schema if input is empty
+        print("No annotations found, returning empty dataframe")
         return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])

     # 1. Create initial DataFrame from the list of annotations
@@ -1302,7 +1303,6 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
             else []
             for anno in all_annotations
         ]
-
     })

     # 2. Calculate the page number using the helper function
@@ -1718,6 +1718,75 @@ def fill_missing_box_ids(data_input: dict) -> dict:
     # The input dictionary 'data_input' has been modified in place
     return data_input

+def fill_missing_box_ids_each_box(data_input: Dict) -> Dict:
+    """
+    Generates unique alphanumeric IDs for bounding boxes in a list
+    where the 'id' is missing, blank, or not a 12-character string.
+
+    Args:
+        data_input (Dict): The input dictionary containing 'image' and 'boxes' keys.
+                           'boxes' should be a list of dictionaries, each potentially
+                           with an 'id' key.
+
+    Returns:
+        Dict: The input dictionary with missing/invalid box IDs filled.
+              Note: The function modifies the input dictionary in place.
+    """
+    # --- Input Validation ---
+    if not isinstance(data_input, dict):
+        raise TypeError("Input 'data_input' must be a dictionary.")
+    if 'boxes' not in data_input or not isinstance(data_input.get('boxes'), list):
+        # If there are no boxes, there's nothing to do.
+        return data_input
+
+    boxes_list = data_input['boxes']
+    id_length = 12
+    character_set = string.ascii_letters + string.digits
+
+    # --- 1. Get ALL Existing IDs to Ensure Uniqueness ---
+    # Collect all valid existing IDs from the entire list first.
+    existing_ids = set()
+    for box in boxes_list:
+        if isinstance(box, dict):
+            box_id = box.get('id')
+            if isinstance(box_id, str) and len(box_id) == id_length:
+                existing_ids.add(box_id)
+
+    # --- 2. Iterate and Fill IDs for each box ---
+    generated_ids_this_run = set() # Keep track of IDs generated in this run
+    num_filled = 0
+
+    for box in boxes_list:
+        if not isinstance(box, dict):
+            continue # Skip items in the list that are not dictionaries
+
+        box_id = box.get('id')
+
+        # Check if this specific box needs a new ID
+        needs_new_id = (
+            box_id is None or
+            not isinstance(box_id, str) or
+            box_id.strip() == "" or
+            len(box_id) != id_length
+        )
+
+        if needs_new_id:
+            # Generate a truly unique ID
+            while True:
+                candidate_id = ''.join(random.choices(character_set, k=id_length))
+                # Check against original IDs and newly generated IDs
+                if candidate_id not in existing_ids and candidate_id not in generated_ids_this_run:
+                    generated_ids_this_run.add(candidate_id)
+                    box['id'] = candidate_id # Assign the ID to the individual box
+                    num_filled += 1
+                    break # Move to the next box
+
+    if num_filled > 0:
+        print(f"Successfully filled {num_filled} missing or invalid box IDs.")
+
+    # The input dictionary 'data_input' has been modified in place
+    return data_input
+
 def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
     """
     Optimized: Generates unique alphanumeric IDs for rows in a DataFrame column
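Note: a minimal usage sketch of the new `fill_missing_box_ids_each_box` helper. The annotation group below is hypothetical (made-up image path, labels, and coordinates), shown only to illustrate the `{"image": ..., "boxes": [...]}` shape the function expects:

```python
from tools.file_conversion import fill_missing_box_ids_each_box

# Hypothetical annotation group; path, labels, and coordinates are made up.
annotation_group = {
    "image": "input/example_page_0.png",
    "boxes": [
        {"label": "Whole page", "xmin": 0.0, "ymin": 0.0, "xmax": 1.0, "ymax": 1.0, "text": "", "id": ""},
        {"label": "CUSTOM", "xmin": 0.1, "ymin": 0.2, "xmax": 0.9, "ymax": 0.25, "text": "example", "id": "AbCdEf123456"},
    ],
}

annotation_group = fill_missing_box_ids_each_box(annotation_group)
# The first box receives a fresh 12-character alphanumeric id; the second,
# already a valid 12-character string, is left untouched.
```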
tools/file_redaction.py
CHANGED
@@ -287,7 +287,7 @@ def choose_and_run_redactor(file_paths:List[str],
         # Call prepare_image_or_pdf only if needed
         if prepare_images_flag is not None:
             out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox = prepare_image_or_pdf(
-                file_paths_loop, text_extraction_method, 0, out_message, True,
+                file_paths_loop, text_extraction_method, all_line_level_ocr_results_df, 0, out_message, True,
                 annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
                 output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
             )
@@ -887,7 +887,6 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,

     return img_annotation_box, rect

-
 def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
     '''
     Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
@@ -932,23 +931,6 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,

     return img_annotation_box, rect

-# def set_cropbox_safely(page, original_cropbox):
-#     """
-#     Sets the cropbox of a page, ensuring it's not larger than the mediabox.
-#     If the original cropbox is larger, the mediabox is used instead.
-
-#     Args:
-#         page: The PyMuPdf page object.
-#         original_cropbox: The fitz.Rect representing the desired cropbox.
-#     """
-#     mediabox = page.mediabox
-#     if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
-#         #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
-#         page.set_cropbox(mediabox)
-#     else:
-#         page.set_cropbox(original_cropbox)
-
-
 def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
     """
     Sets the cropbox of a PyMuPDF page safely and defensively.
@@ -995,7 +977,6 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
     else:
         page.set_cropbox(original_cropbox)

-
 def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):

     rect_height = page.rect.height
@@ -1788,7 +1769,6 @@ def redact_image_pdf(file_path:str,

     return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words

-
 ###
 # PIKEPDF TEXT DETECTION/REDACTION
 ###
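Note: the commented-out legacy `set_cropbox_safely` deleted above documents the original guard that the current, more defensive version grew out of. A self-contained sketch of just that check, using the `pymupdf` package this file already depends on:

```python
import pymupdf

# Reproduces the guard from the removed legacy set_cropbox_safely: a requested
# cropbox larger than the mediabox is invalid, so fall back to the mediabox.
doc = pymupdf.open()
page = doc.new_page(width=595, height=842)  # roughly A4, in points

requested = pymupdf.Rect(0, 0, 700, 900)  # deliberately larger than the mediabox
if requested.width > page.mediabox.width or requested.height > page.mediabox.height:
    page.set_cropbox(page.mediabox)
else:
    page.set_cropbox(requested)

print(page.cropbox)  # falls back to the full mediabox: Rect(0.0, 0.0, 595.0, 842.0)
```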
tools/find_duplicate_pages.py
CHANGED
@@ -4,13 +4,15 @@ import re
 from tools.helper_functions import OUTPUT_FOLDER
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
-from typing import List, Tuple
+from typing import List, Tuple, Optional, Dict
+from collections import defaultdict
 import gradio as gr
 from gradio import Progress
 from pathlib import Path
 from pymupdf import Document
-from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe
+from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
 import en_core_web_lg
+
 nlp = en_core_web_lg.load()

 similarity_threshold = 0.95
@@ -56,10 +58,11 @@ def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, outp
         grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
     else:
         df['line_number_by_page'] = df.groupby('page').cumcount() + 1
+        df['original_page'] = df['page']
         df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
         df['page'] = df['page'].astype(int)

-        grouped = df
+        grouped = df #.drop('line_number_by_page', axis=1)

     # Add filename column
     grouped['file'] = os.path.basename(file_path)
@@ -405,7 +408,7 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
     # Return the updated dataframe, the new file list, and clear the preview panes
     return updated_df, new_output_paths, None, None

-def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool,
+def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
     """
     Wrapper function updated to include the 'greedy_match' boolean.
     """
@@ -414,7 +417,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
         return None, None, None

     progress(0, desc="Combining input files...")
-    df_combined, _ = combine_ocr_output_text(files, combine_pages=
+    df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)

     if df_combined.empty:
         gr.Warning("No data found in the uploaded files.")
@@ -427,7 +430,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
         min_word_count=min_words,
         min_consecutive_pages=int(min_consecutive),
         greedy_match=greedy_match,
-        combine_pages=
+        combine_pages=combine_pages,
         progress=progress
     )
@@ -476,132 +479,438 @@ def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: g

     return page1_data[['page', 'text']], page2_data[['page', 'text']]

+def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
+    """
+    Finds and returns the size and path information for a specific page.
+    """
+    return next((size for size in page_sizes if size["page"] == page_num), None)
+
+def add_new_annotations_to_existing_page_annotations(
+    all_annotations: List[Dict],
+    image_path: str,
+    new_annotation_boxes: List[Dict]
+) -> Tuple[List[Dict], Dict]:
+    """
+    Adds a list of new annotation boxes to the annotations for a specific page.
+
+    If the page already has annotations, it extends the list of boxes. If not,
+    it creates a new entry for the page.
+
+    Args:
+        all_annotations (List[Dict]): The current list of all annotation groups.
+        image_path (str): The identifier for the image/page.
+        new_annotation_boxes (List[Dict]): A list of new annotation boxes to add.
+
+    Returns:
+        Tuple[List[Dict], Dict]: A tuple containing:
+            - The updated list of all annotation groups.
+            - The annotation group representing the newly added boxes.
+    """
+    # Find the annotation group for the current page/image
+    current_page_group = next(
+        (annot_group for annot_group in all_annotations if annot_group["image"] == image_path),
+        None
+    )
+
+    if current_page_group:
+        # Page already has annotations, so extend the list with the new boxes
+        current_page_group["boxes"].extend(new_annotation_boxes)
+    else:
+        # This is the first set of annotations for this page, create a new group
+        new_group = {
+            "image": image_path,
+            "boxes": new_annotation_boxes
+        }
+        all_annotations.append(new_group)
+
+    # This object represents all annotations that were just added for this page
+    newly_added_annotation_group = {
+        "image": image_path,
+        "boxes": new_annotation_boxes
+    }
+
+    return all_annotations, newly_added_annotation_group
+
+def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=[]):
     '''
-    Take a list of suggested whole pages to redact and apply it to review file data
+    Take a list of suggested whole pages to redact and apply it to review file data.
     '''
-    # Create a copy of annotations to avoid modifying the original
     all_annotations = all_existing_annotations.copy()

-    if not pymupdf_doc:
+    if not pymupdf_doc:
+        message = "No document file currently under review."
+        print(f"Warning: {message}")
+        raise Warning(message)

+    list_whole_pages_to_redact = []
+
+    if combine_pages == True:
+        # Get list of pages to redact from either dataframe or file
+        if not duplicate_page_numbers_df.empty:
+            list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
+        elif duplicate_output_paths:
+            expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
+            whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
+
+            for output_file in duplicate_output_paths:
+                # Note: output_file.name might not be available if output_file is just a string path
+                # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
+                file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
+                if expected_duplicate_pages_to_redact_name in file_name_from_path:
+                    whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
+                    break
+        else:
+            message = "No relevant list of whole pages to redact found."
+            print(message)
+            raise Warning(message)

            if not whole_pages_list.empty:
                list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
+
+        list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
+
+    else:
+        if not new_annotations_with_bounding_boxes:
+            message = "Can't find any new annotations to add"
+            print(message)
+            raise Warning(message)
+
+        list_whole_pages_to_redact = []
+        for annotation in new_annotations_with_bounding_boxes:
+            match = re.search(r'_(\d+)\.png$', annotation["image"])
+            if match:
+                page = int(match.group(1)) + 1
+                list_whole_pages_to_redact.append(page)
+            else:
+                print(f"Warning: Could not extract page number from {annotation['image']}")
+
+        list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))

    new_annotations = []
    # Process each page for redaction
    for page in list_whole_pages_to_redact:
        try:
+            page_num = int(page)
+            page_index = page_num - 1
+            if not (0 <= page_index < len(pymupdf_doc)):
+                print(f"Page {page_num} is out of bounds, skipping.")
                continue

+            page_info = get_page_image_info(page_num, page_sizes)
-            if not page_size:
-                print(f"Page {page} not found in page_sizes object, skipping.")
+            if not page_info:
+                print(f"Page {page_num} not found in page_sizes, skipping.")
                continue

-            pymupdf_page = pymupdf_doc[page_index]
-            current_page_existing_boxes_group = next((annot_group for annot_group in all_annotations if annot_group["image"] == image), None)
-            if not any(box["label"] == "Whole page" for box in current_page_existing_boxes_group["boxes"]):
-                current_page_existing_boxes_group["boxes"].append(annotation_box)
-            else:
-                # Optional: Print a message if a whole-page redaction already exists for this page
-                print(f"Whole page redaction for page {page} already exists in annotations, skipping addition.")
-                pass
-            else: # Create new annotation entry
-                all_annotations.append(new_annotation_group)
-                new_annotations.append(new_annotation_group)
+            image_path = page_info["image_path"]
+            page_annotation_group = next((g for g in all_annotations if g["image"] == image_path), None)
+            if page_annotation_group and any(box["label"] == "Whole page" for box in page_annotation_group["boxes"]):
+                print(f"Whole page redaction for page {page_num} already exists, skipping.")
+                continue
+
+            # --- Create a LIST of boxes to add.---
+            boxes_to_add = []

+            pymupdf_page = pymupdf_doc[page_index]

+            if combine_pages==True:
+                whole_page_box = redact_whole_pymupdf_page(
+                    rect_height=page_info["cropbox_height"],
+                    rect_width=page_info["cropbox_width"],
+                    page=pymupdf_page, border=0.005, redact_pdf=False
+                )
+                boxes_to_add.append(whole_page_box)
+            else:
+                # Find the specific annotation group that matches the current page's image path
+                relevant_box_group = next(
+                    (group for group in new_annotations_with_bounding_boxes if group.get('image') == image_path),
+                    None # Default to None if no match is found
+                )
+
+                # Check if we found a matching group of boxes for this page
+                if relevant_box_group:
+                    boxes_to_add.extend(relevant_box_group['boxes'])
+                else:
+                    # This case would be unexpected, but it's good to handle.
+                    # It means a page was in list_whole_pages_to_redact but had no
+                    # corresponding boxes generated in new_annotations_with_bounding_boxes.
+                    print(f"Warning: No new annotation boxes found for page {page_num} ({image_path}).")
+
+            # === Use the modified helper function to add a LIST of boxes ===
+            all_annotations, new_annotations_for_page = add_new_annotations_to_existing_page_annotations(
+                all_annotations=all_annotations,
+                image_path=image_path,
+                new_annotation_boxes=boxes_to_add # Pass the list here
+            )

+            new_annotations_for_page = fill_missing_box_ids_each_box(new_annotations_for_page)
+            new_annotations.append(new_annotations_for_page)

        except Exception as e:
            print(f"Error processing page {page}: {str(e)}")
            continue

-    # Convert annotations to dataframe and combine with existing review file
    whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
+
+    if whole_page_review_file.empty:
+        message = "No new whole page redactions were added."
+        print(message)
+        gr.Info(message)
+        return review_file_state, all_annotations
+
    expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
    for col in expected_cols:
-        if col not in review_file_state.columns:
-        if col not in whole_page_review_file.columns:
-            whole_page_review_file[col] = None
+        if col not in review_file_state.columns: review_file_state[col] = pd.NA
+        if col not in whole_page_review_file.columns: whole_page_review_file[col] = pd.NA

    review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
-    review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"])
+    review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
+    review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
+
+    out_message = "Successfully created whole page redactions."
+    print(out_message)
+    gr.Info(out_message)
+
+    return review_file_out, all_annotations

+
+# --- 1. Helper Function to Parse the Combined Page/Line ID ---
+def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
+    """
+    Parses a combined page and line number ID into a (page, line) tuple.
+    Assumes the ID is a 10-digit number where the first 5 are the page
+    and the last 5 are the line number.
+
+    Example: 100027 -> (1, 27)
+             200005 -> (2, 5)
+    """
+    # zfill ensures the string is padded with leading zeros to 10 characters
+    s_id = str(combined_id).zfill(10)
+    page = int(s_id[:5])
+    line = int(s_id[5:])
+    return page, line
+
+# def create_annotations_from_ocr_outputs(ocr_results_df_lines_to_annotate:pd.DataFrame):
+#     '''
+#     Create a set of annotation boxes based on selected ocr_results_df lines.
+#     '''
+#     annotations_by_page = []
+
+#     # --- Build Annotation Boxes for each selected line ---
+#     for _, line_row in ocr_results_df_lines_to_annotate.iterrows():
+#         # The coordinates are relative, so xmax = left + width and ymax = top + height
+#         box = {
+#             "label": "Similar Text", # Or any other label you prefer
+#             "xmin": line_row['left'],
+#             "ymin": line_row['top'] + line_row['height'],
+#             "xmax": line_row['left'] + line_row['width'],
+#             "ymax": line_row['top'] ,
+#             "text": line_row['text']
+#         }
+#         # --- 6. Group the box by its page number ---
+#         page_number = line_row['page']
+#         annotations_by_page[page_number].append(box)
+
+#     return annotations_by_page
+
+# def create_annotation_objects_from_duplicates(
+#     duplicates_df: pd.DataFrame,
+#     ocr_results_df: pd.DataFrame,
+#     combine_pages:bool=False
+# ) -> List[Dict]:
+#     """
+#     Creates structured annotation objects from selected ocr outputs.
+
+#     Args:
+#         duplicates_df (pd.DataFrame): DataFrame containing duplicate ranges with
+#                                       columns like 'Page2_Start_Page' and 'Page2_End_Page'.
+#         ocr_results_df (pd.DataFrame): DataFrame with OCR results, including columns
+#                                        'page', 'text', 'left', 'top', 'width', 'height'.
+
+#     Returns:
+#         List[Dict]: A list of dictionaries, where each dict represents a page and its
+#                     list of annotation boxes, in the format:
+#                     [{"page": 1, "boxes": [...]}, {"page": 2, "boxes": [...]}]
+#     """
+#     annotations_by_page = []
+
+#     if combine_pages == False:
+
+#         # --- 2. Prepare OCR Data: Add a line number column if it doesn't exist ---
+#         if 'line_number_by_page' not in ocr_results_df.columns:
+#             print("Generating 'line_number_by_page' for ocr_results_df...")
+#             # Sort by page and original position to ensure correct line numbering
+#             ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
+#             ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
+
+#         # Use defaultdict to easily append to lists for each page
+#         annotations_by_page = defaultdict(list)
+
+#         # --- 3. Iterate through each duplicate range ---
+#         for _, row in duplicates_df.iterrows():
+#             # Parse the start and end page/line numbers from the duplicate row
+#             start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
+#             end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
+
+#             # --- 4. Select OCR Lines based on the range ---
+#             # This logic correctly handles ranges within a single page and across multiple pages
+#             if start_page == end_page:
+#                 # Simple case: the range is on a single page
+#                 condition = (
+#                     (ocr_results_df['page'] == start_page) &
+#                     (ocr_results_df['line_number_by_page'].between(start_line, end_line))
+#                 )
+#             else:
+#                 # Complex case: the range spans multiple pages
+#                 # Condition for the first page in the range
+#                 cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
+#                 # Condition for all pages between the start and end
+#                 cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
+#                 # Condition for the last page in the range
+#                 cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
+
+#                 condition = cond_start | cond_middle | cond_end
+
+#             lines_to_annotate = ocr_results_df[condition]
+
+#             annotations_by_page = create_annotations_from_ocr_outputs(lines_to_annotate)
+
+#     # --- Format the final output list ---
+#     final_output = []
+#     # Sort by page number for a predictable order
+#     for page, boxes in sorted(annotations_by_page.items()):
+#         final_output.append({
+#             "page": page,
+#             "boxes": boxes
+#         })
+
+#     return final_output
+
+def create_annotation_objects_from_duplicates(
+    duplicates_df: pd.DataFrame,
+    ocr_results_df: pd.DataFrame,
+    page_sizes: List[Dict],
+    combine_pages:bool=False
+) -> List[Dict]:
+    """
+    Creates structured annotation objects from duplicate line ranges, mapping
+    page numbers to image paths.
+
+    Args:
+        duplicates_df (pd.DataFrame): DataFrame with duplicate ranges.
+        ocr_results_df (pd.DataFrame): DataFrame with OCR results.
+        page_sizes (List[Dict]): A list of dictionaries mapping page numbers to image paths and other metadata. Expected format: [{"page": 1, "image_path": "path/to/img.png", ...}]
+        combine_pages (bool): A boolean that determines whether in previous functions, all text from a page was combined (True). This function will only run if this is False.
+
+    Returns:
+        List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
+    """
+    final_output = []
+
+    if combine_pages == False:
+        # --- NEW: Create an efficient lookup map from page number to image path ---
+        page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
+
+        # Prepare OCR Data: Add a line number column if it doesn't exist
+        if 'line_number_by_page' not in ocr_results_df.columns:
+            ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
+            ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
+
+        annotations_by_page = defaultdict(list)
+
+        # Iterate through each duplicate range (this logic is unchanged)
+        for _, row in duplicates_df.iterrows():
+            start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
+            end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
+
+            # Select OCR Lines based on the range (this logic is unchanged)
+            if start_page == end_page:
+                condition = (
+                    (ocr_results_df['page'] == start_page) &
+                    (ocr_results_df['line_number_by_page'].between(start_line, end_line))
+                )
+            else:
+                cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
+                cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
+                cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
+                condition = cond_start | cond_middle | cond_end
+
+            lines_to_annotate = ocr_results_df[condition]
+
+            # Build and group annotation boxes by page number (this logic is unchanged)
+            for _, line_row in lines_to_annotate.iterrows():
+                box = {
+                    "label": "Duplicate text",
+                    "color": (0,0,0),
+                    "xmin": line_row['left'],
+                    "ymin": line_row['top'],
+                    "xmax": line_row['left'] + line_row['width'],
+                    "ymax": line_row['top'] + line_row['height'],
+                    "text": line_row['text'],
+                    "id": "" # to be filled in after
+                }
+                page_number = line_row['page']
+
+                annotations_by_page[page_number].append(box)
+
+        print("annotations_by_page:", annotations_by_page)
+
+        # --- Format the final output list using the page-to-image map ---
+        final_output = []
+        # Sort by page number for a predictable order
+        for page_num, boxes in sorted(annotations_by_page.items()):
+            # Look up the image path using the page number
+            image_path = page_to_image_map.get(page_num)
+
+            if image_path:
+                page_boxes = {
+                    "image": image_path,
+                    "boxes": boxes
+                }
+
+                # Fill in missing IDs for the new data entries
+                page_boxes = fill_missing_box_ids_each_box(page_boxes)
+
+                # Add the annotation group using 'image' as the key
+                final_output.append(page_boxes)
+            else:
+                # Handle cases where a page might not have a corresponding image path
+                print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
+                      f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
+
+    print("final_output:", final_output)
+
+    return final_output
+
+# --- Example Usage ---
+
+# 1. Create your example DataFrames
+# duplicates_data = {
+#     'Page1_File': ['doc_a.csv'],
+#     'Page1_Start_Page': [100009],
+#     'Page1_End_Page': [100021],
+#     'Page2_File': ['doc_a.csv'],
+#     'Page2_Start_Page': [100027], # Page 1, Line 27
+#     'Page2_End_Page': [200005], # Page 2, Line 5
+# }
+# duplicates_df = pd.DataFrame(duplicates_data)
+
+# ocr_data = {
+#     'page': [1]*30 + [2]*10, # 30 lines on page 1, 10 on page 2
+#     'text': [f"Text on page {p}, line {l}" for p in [1, 2] for l in range(1, (31 if p==1 else 11))],
+#     # Example coordinates (using small, consistent values for demonstration)
+#     'left': [0.1] * 40,
+#     'top': [i*0.02 for i in range(30)] + [i*0.02 for i in range(10)],
+#     'width': [0.8] * 40,
+#     'height': [0.015] * 40,
+# }
+# ocr_results_df = pd.DataFrame(ocr_data)
+
+
+# # 2. Run the function
+# generated_annotations = create_annotation_objects_from_duplicates(duplicates_df, ocr_results_df)
+
+# # 3. Print the result
+# import json
+# print(json.dumps(generated_annotations, indent=2))
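Note: a quick self-contained check that the combined page/line ID scheme round-trips. The `encode_page_line` helper below is illustrative only (it mirrors the encoding in `combine_ocr_output_text`; only `_parse_page_line_id` exists in the module):

```python
def encode_page_line(page: int, line: int) -> int:
    # Mirrors the encoding in combine_ocr_output_text:
    # 5-digit zero-padded page + 5-digit zero-padded line number.
    return int(str(page).zfill(5) + str(line).zfill(5))

def parse_page_line(combined_id: int) -> tuple:
    # Mirrors _parse_page_line_id above.
    s_id = str(combined_id).zfill(10)
    return int(s_id[:5]), int(s_id[5:])

assert parse_page_line(encode_page_line(1, 27)) == (1, 27)
assert parse_page_line(encode_page_line(2, 5)) == (2, 5)
assert parse_page_line(100027) == (1, 27)  # matches the docstring example
```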
tools/helper_functions.py
CHANGED
@@ -89,6 +89,16 @@ def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str,
     ]
     return cost_code_df

+def ensure_folder_exists(output_folder:str):
+    """Checks if the specified folder exists, creates it if not."""
+
+    if not os.path.exists(output_folder):
+        # Create the folder if it doesn't exist
+        os.makedirs(output_folder, exist_ok=True)
+        print(f"Created the {output_folder} folder.")
+    else:
+        print(f"The {output_folder} folder already exists.")
+
 def update_dataframe(df:pd.DataFrame):
     df_copy = df.copy()
     return df_copy
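Note: usage is a one-liner, and `os.makedirs(..., exist_ok=True)` makes the call safe to repeat:

```python
from tools.helper_functions import ensure_folder_exists

ensure_folder_exists("output/")  # prints whether the folder was created or already present
```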
tools/redaction_review.py
CHANGED
@@ -1429,7 +1429,6 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List

     return output_paths

-
 ### Convert xfdf coordinates back to image for app

 def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):