seanpedrickcase committed
Commit c8ffcd4 · Parent: e424038

Further updates to line-level duplicate identification

Renames the page/line analysis toggle to combine_page_text_for_duplicates_bool, wires the new create_annotation_objects_from_duplicates step into the apply-match flow, threads all_line_level_ocr_results_df through prepare_image_or_pdf, moves ensure_folder_exists out of tools/config.py and into tools/helper_functions.py (output folders are now created at app startup), makes the local and S3 log folder locations independently configurable, and adds fill_missing_box_ids_each_box for assigning unique IDs to individual annotation boxes.

app.py CHANGED
@@ -2,8 +2,8 @@ import os
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME
6
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
@@ -12,12 +12,22 @@ from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
- from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
  # Convert string environment variables to string or list
22
  if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
23
  else: SAVE_LOGS_TO_CSV = False
@@ -89,15 +99,15 @@ with app:
89
  # Backup versions of these objects in case you make a mistake
90
  backup_review_state = gr.Dataframe(visible=False)
91
  backup_image_annotations_state = gr.State([])
92
- backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
93
 
94
  # Logging variables
95
  access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
96
- access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
97
  feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
98
- feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
99
  usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
100
- usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
101
 
102
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
103
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
@@ -164,7 +174,7 @@ with app:
164
 
165
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
166
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
167
- local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
168
 
169
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
170
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
@@ -173,9 +183,8 @@ with app:
173
 
174
  # Base tables that are not modified subsequent to load
175
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
176
- all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
177
- all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
178
- cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
179
 
180
  # Placeholder for selected entity dataframe row
181
  selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)
@@ -197,6 +206,7 @@ with app:
197
  page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
198
 
199
  # Placeholders for elements that may be made visible later below depending on environment variables
200
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
201
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
202
 
@@ -226,6 +236,9 @@ with app:
226
  textract_job_output_file = gr.File(label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False)
227
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
228
 
229
  ###
230
  # UI DESIGN
231
  ###
@@ -408,7 +421,7 @@ with app:
408
  with gr.Row():
409
  duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
410
  min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
411
- duplicates_by_line_or_page_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
412
 
413
  gr.Markdown("#### Matching Strategy")
414
  greedy_match_input = gr.Checkbox(
@@ -588,7 +601,7 @@ with app:
588
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
589
 
590
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
591
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
592
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
593
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
594
 
@@ -627,7 +640,7 @@ with app:
627
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
628
 
629
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
630
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
631
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
632
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
633
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
@@ -644,7 +657,7 @@ with app:
644
  # Upload previous files for modifying redactions
645
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
646
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
647
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
648
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
649
 
650
  # Manual updates to review di
@@ -725,6 +738,7 @@ with app:
725
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
726
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
727
 
728
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
729
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
730
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
@@ -734,16 +748,17 @@ with app:
734
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
735
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
736
 
737
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
738
 
739
  # Convert review file to xfdf Adobe format
740
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
741
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
742
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
743
 
744
  # Convert xfdf Adobe file back to review_file.csv
745
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
746
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
747
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
748
 
749
  ###
@@ -764,6 +779,8 @@ with app:
764
  ###
765
  # IDENTIFY DUPLICATE PAGES
766
  ###
767
  find_duplicate_pages_btn.click(
768
  fn=run_duplicate_analysis,
769
  inputs=[
@@ -772,7 +789,7 @@ with app:
772
  min_word_count_input,
773
  min_consecutive_pages_input,
774
  greedy_match_input,
775
- duplicates_by_line_or_page_bool
776
  ],
777
  outputs=[
778
  results_df_preview,
@@ -795,9 +812,9 @@ with app:
795
  outputs=[results_df_preview, duplicate_files_out, page1_text_preview, page2_text_preview]
796
  )
797
 
798
- apply_match_btn.click(
799
- fn=apply_whole_page_redactions_from_list,
800
- inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
801
  outputs=[review_file_df, all_image_annotations_state]).\
802
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
803
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
+ from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
+ # Ensure that output folders exist
22
+ ensure_folder_exists(CONFIG_FOLDER)
23
+ ensure_folder_exists(OUTPUT_FOLDER)
24
+ ensure_folder_exists(INPUT_FOLDER)
25
+ ensure_folder_exists(GRADIO_TEMP_DIR)
26
+ ensure_folder_exists(MPLCONFIGDIR)
27
+ ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
28
+ ensure_folder_exists(ACCESS_LOGS_FOLDER)
29
+ ensure_folder_exists(USAGE_LOGS_FOLDER)
30
+
31
  # Convert string environment variables to string or list
32
  if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
33
  else: SAVE_LOGS_TO_CSV = False
 
99
  # Backup versions of these objects in case you make a mistake
100
  backup_review_state = gr.Dataframe(visible=False)
101
  backup_image_annotations_state = gr.State([])
102
+ backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
103
 
104
  # Logging variables
105
  access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
106
+ access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=S3_ACCESS_LOGS_FOLDER, visible=False)
107
  feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
108
+ feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=S3_FEEDBACK_LOGS_FOLDER, visible=False)
109
  usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
110
+ usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=S3_USAGE_LOGS_FOLDER, visible=False)
111
 
112
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
113
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
 
174
 
175
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
176
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
177
+ local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
178
 
179
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
180
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
 
183
 
184
  # Base tables that are not modified subsequent to load
185
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
186
+ all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page","text", "left","top","width","height"], row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
187
+ all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
188
 
189
  # Placeholder for selected entity dataframe row
190
  selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)
 
206
  page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
207
 
208
  # Placeholders for elements that may be made visible later below depending on environment variables
209
+ cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
210
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
211
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
212
 
 
236
  textract_job_output_file = gr.File(label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False)
237
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
238
 
239
+ ## Duplicate search object
240
+ new_duplicate_search_annotation_object = gr.Dropdown(value=None, label="new_duplicate_search_annotation_object", allow_custom_value=True, visible=False)
241
+
242
  ###
243
  # UI DESIGN
244
  ###
 
421
  with gr.Row():
422
  duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
423
  min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
424
+ combine_page_text_for_duplicates_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
425
 
426
  gr.Markdown("#### Matching Strategy")
427
  greedy_match_input = gr.Checkbox(
 
601
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
602
 
603
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
604
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
605
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
606
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
607
 
 
640
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
641
 
642
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
643
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
644
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
645
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
646
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
 
657
  # Upload previous files for modifying redactions
658
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
659
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
660
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
661
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
662
 
663
  # Manual updates to review di
 
738
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
739
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
740
 
741
+ # Undo last redaction exclusion action
742
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
743
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
744
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
 
748
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
749
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
750
 
751
+ # Reset the OCR results filter
752
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
753
 
754
  # Convert review file to xfdf Adobe format
755
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
756
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
757
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
758
 
759
  # Convert xfdf Adobe file back to review_file.csv
760
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
761
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
762
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
763
 
764
  ###
 
779
  ###
780
  # IDENTIFY DUPLICATE PAGES
781
  ###
782
+ #in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox])
783
+
784
  find_duplicate_pages_btn.click(
785
  fn=run_duplicate_analysis,
786
  inputs=[
 
789
  min_word_count_input,
790
  min_consecutive_pages_input,
791
  greedy_match_input,
792
+ combine_page_text_for_duplicates_bool
793
  ],
794
  outputs=[
795
  results_df_preview,
 
812
  outputs=[results_df_preview, duplicate_files_out, page1_text_preview, page2_text_preview]
813
  )
814
 
815
+ apply_match_btn.click(fn=create_annotation_objects_from_duplicates, inputs=[results_df_preview, all_line_level_ocr_results_df_base, page_sizes, combine_page_text_for_duplicates_bool], outputs=[new_duplicate_search_annotation_object]).\
816
+ success(fn=apply_whole_page_redactions_from_list,
817
+ inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state, combine_page_text_for_duplicates_bool, new_duplicate_search_annotation_object],
818
  outputs=[review_file_df, all_image_annotations_state]).\
819
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
820
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
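
A note on the apply-match flow wired above: create_annotation_objects_from_duplicates now runs first, building annotation objects from the duplicate-analysis preview and the line-level OCR results, and apply_whole_page_redactions_from_list then merges those into the review file and the image annotations. Below is a minimal plain-Python sketch of the same chain; the argument order mirrors the Gradio wiring above, the local variable names are invented for illustration, and the exact signatures live in tools/find_duplicate_pages.py, which this commit view does not show.

from tools.find_duplicate_pages import (
    create_annotation_objects_from_duplicates,
    apply_whole_page_redactions_from_list,
)

# Values normally supplied by the Gradio components at click time (names here are illustrative).
new_annotations = create_annotation_objects_from_duplicates(
    duplicate_results_df,   # results_df_preview: matches found by run_duplicate_analysis
    line_level_ocr_df,      # all_line_level_ocr_results_df_base: page, text, left, top, width, height
    page_sizes,             # per-page dimensions, used to place the redaction boxes
    combine_page_text,      # combine_page_text_for_duplicates_bool: True = by page, False = by line
)

review_file_df, all_image_annotations = apply_whole_page_redactions_from_list(
    fully_redacted_pages_list, file_name_with_extension, review_file_df,
    duplicate_output_files, pdf_doc, page_sizes, all_image_annotations,
    combine_page_text, new_annotations,
)
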
tools/config.py CHANGED
@@ -28,16 +28,6 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
28
 
29
  return value
30
 
31
- def ensure_folder_exists(output_folder:str):
32
- """Checks if the specified folder exists, creates it if not."""
33
-
34
- if not os.path.exists(output_folder):
35
- # Create the folder if it doesn't exist
36
- os.makedirs(output_folder, exist_ok=True)
37
- print(f"Created the {output_folder} folder.")
38
- else:
39
- print(f"The {output_folder} folder already exists.")
40
-
41
  def add_folder_to_path(folder_path: str):
42
  '''
43
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
@@ -59,15 +49,12 @@ def add_folder_to_path(folder_path: str):
59
  else:
60
  print(f"Folder not found at {folder_path} - not added to PATH")
61
 
62
-
63
  ###
64
  # LOAD CONFIG FROM ENV FILE
65
  ###
66
 
67
  CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
68
 
69
- ensure_folder_exists(CONFIG_FOLDER)
70
-
71
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
72
  APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
73
 
@@ -115,9 +102,6 @@ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
115
  # Retrieving or setting CUSTOM_HEADER_VALUE
116
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
117
 
118
-
119
-
120
-
121
  ###
122
  # Image options
123
  ###
@@ -134,9 +118,6 @@ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
134
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
135
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
136
 
137
- ensure_folder_exists(OUTPUT_FOLDER)
138
- ensure_folder_exists(INPUT_FOLDER)
139
-
140
  # Allow for files to be saved in a temporary folder for increased security in some instances
141
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
142
  # Create a temporary directory
@@ -146,13 +127,9 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
146
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
147
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
148
 
149
-
150
  GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
151
  MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
152
 
153
- ensure_folder_exists(GRADIO_TEMP_DIR)
154
- ensure_folder_exists(MPLCONFIGDIR)
155
-
156
  ###
157
  # LOGGING OPTIONS
158
  ###
@@ -164,32 +141,33 @@ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
164
 
165
  USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
166
 
167
  if USE_LOG_SUBFOLDERS == "True":
168
  day_log_subfolder = today_rev + '/'
169
  host_name_subfolder = HOST_NAME + '/'
170
  full_log_subfolder = day_log_subfolder + host_name_subfolder
171
- else:
172
- full_log_subfolder = ""
173
 
174
- FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
175
- ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
176
- USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
177
 
178
- ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
179
- ensure_folder_exists(ACCESS_LOGS_FOLDER)
180
- ensure_folder_exists(USAGE_LOGS_FOLDER)
181
 
182
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
183
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
184
 
185
  # Further customisation options for CSV logs
186
-
187
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
188
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
189
  CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
190
 
191
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
192
-
193
  SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
194
 
195
  ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
@@ -213,7 +191,6 @@ USAGE_LOG_FILE_NAME = get_or_create_env_var('USAGE_LOG_FILE_NAME', LOG_FILE_NAME
213
  FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
214
 
215
 
216
-
217
  ###
218
  # REDACTION OPTIONS
219
  ###
 
28
 
29
  return value
30
 
31
  def add_folder_to_path(folder_path: str):
32
  '''
33
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 
49
  else:
50
  print(f"Folder not found at {folder_path} - not added to PATH")
51
 
52
  ###
53
  # LOAD CONFIG FROM ENV FILE
54
  ###
55
 
56
  CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
57
 
58
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
59
  APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
60
 
 
102
  # Retrieving or setting CUSTOM_HEADER_VALUE
103
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
104
 
105
  ###
106
  # Image options
107
  ###
 
118
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
119
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
120
 
121
  # Allow for files to be saved in a temporary folder for increased security in some instances
122
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
123
  # Create a temporary directory
 
127
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
128
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
129
 
130
  GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
131
  MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
132
 
133
  ###
134
  # LOGGING OPTIONS
135
  ###
 
141
 
142
  USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
143
 
144
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/')
145
+ ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/')
146
+ USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/')
147
+
148
  if USE_LOG_SUBFOLDERS == "True":
149
  day_log_subfolder = today_rev + '/'
150
  host_name_subfolder = HOST_NAME + '/'
151
  full_log_subfolder = day_log_subfolder + host_name_subfolder
152
 
153
+ FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
154
+ ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
155
+ USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
156
 
157
+
158
+ S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', FEEDBACK_LOGS_FOLDER)
159
+ S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', ACCESS_LOGS_FOLDER)
160
+ S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', USAGE_LOGS_FOLDER)
161
 
162
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
163
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
164
 
165
  # Further customisation options for CSV logs
166
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
167
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
168
  CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
169
 
170
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
171
  SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
172
 
173
  ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
 
191
  FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
192
 
193
 
194
  ###
195
  # REDACTION OPTIONS
196
  ###
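
A note on the reworked logging configuration above: the base log folders are now read from environment variables first, the dated host subfolder is appended afterwards when USE_LOG_SUBFOLDERS is 'True', and the new S3_* variables default to whatever the local paths resolved to, so S3 uploads mirror the local layout unless explicitly overridden. A self-contained sketch of the resulting paths, using illustrative values for today_rev and HOST_NAME and a simplified stand-in for get_or_create_env_var; the sketch keeps the subfolder append inside the if branch, since the else branch that previously set full_log_subfolder = "" was dropped in this commit.

import os

def get_or_create_env_var(var_name, default_value):
    # Simplified stand-in for the helper in tools/config.py: read the env var, fall back to the default.
    return os.environ.get(var_name, default_value)

today_rev, HOST_NAME = '20250101', 'example-host'  # illustrative values

USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')

FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/')
ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/')
USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/')

if USE_LOG_SUBFOLDERS == 'True':
    full_log_subfolder = today_rev + '/' + HOST_NAME + '/'
    FEEDBACK_LOGS_FOLDER += full_log_subfolder  # e.g. feedback/20250101/example-host/
    ACCESS_LOGS_FOLDER += full_log_subfolder    # e.g. logs/20250101/example-host/
    USAGE_LOGS_FOLDER += full_log_subfolder     # e.g. usage/20250101/example-host/

# S3 locations default to the (already suffixed) local paths unless overridden.
S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', FEEDBACK_LOGS_FOLDER)
S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', ACCESS_LOGS_FOLDER)
S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', USAGE_LOGS_FOLDER)
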
tools/file_conversion.py CHANGED
@@ -455,6 +455,7 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
455
  def prepare_image_or_pdf(
456
  file_paths: List[str],
457
  in_redact_method: str,
458
  latest_file_completed: int = 0,
459
  out_message: List[str] = [],
460
  first_loop_state: bool = False,
@@ -506,7 +507,6 @@ def prepare_image_or_pdf(
506
  pymupdf_doc = []
507
  all_img_details = []
508
  review_file_csv = pd.DataFrame()
509
- all_line_level_ocr_results_df = pd.DataFrame()
510
  out_textract_path = ""
511
  combined_out_message = ""
512
  final_out_message = ""
@@ -1289,6 +1289,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1289
  '''
1290
  if not all_annotations:
1291
  # Return an empty DataFrame with the expected schema if input is empty
1292
  return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])
1293
 
1294
  # 1. Create initial DataFrame from the list of annotations
@@ -1302,7 +1303,6 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1302
  else []
1303
  for anno in all_annotations
1304
  ]
1305
-
1306
  })
1307
 
1308
  # 2. Calculate the page number using the helper function
@@ -1718,6 +1718,75 @@ def fill_missing_box_ids(data_input: dict) -> dict:
1718
  # The input dictionary 'data_input' has been modified in place
1719
  return data_input
1720
 
1721
  def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
1722
  """
1723
  Optimized: Generates unique alphanumeric IDs for rows in a DataFrame column
 
455
  def prepare_image_or_pdf(
456
  file_paths: List[str],
457
  in_redact_method: str,
458
+ all_line_level_ocr_results_df:pd.DataFrame,
459
  latest_file_completed: int = 0,
460
  out_message: List[str] = [],
461
  first_loop_state: bool = False,
 
507
  pymupdf_doc = []
508
  all_img_details = []
509
  review_file_csv = pd.DataFrame()
510
  out_textract_path = ""
511
  combined_out_message = ""
512
  final_out_message = ""
 
1289
  '''
1290
  if not all_annotations:
1291
  # Return an empty DataFrame with the expected schema if input is empty
1292
+ print("No annotations found, returning empty dataframe")
1293
  return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])
1294
 
1295
  # 1. Create initial DataFrame from the list of annotations
 
1303
  else []
1304
  for anno in all_annotations
1305
  ]
1306
  })
1307
 
1308
  # 2. Calculate the page number using the helper function
 
1718
  # The input dictionary 'data_input' has been modified in place
1719
  return data_input
1720
 
1721
+ def fill_missing_box_ids_each_box(data_input: Dict) -> Dict:
1722
+ """
1723
+ Generates unique alphanumeric IDs for bounding boxes in a list
1724
+ where the 'id' is missing, blank, or not a 12-character string.
1725
+
1726
+ Args:
1727
+ data_input (Dict): The input dictionary containing 'image' and 'boxes' keys.
1728
+ 'boxes' should be a list of dictionaries, each potentially
1729
+ with an 'id' key.
1730
+
1731
+ Returns:
1732
+ Dict: The input dictionary with missing/invalid box IDs filled.
1733
+ Note: The function modifies the input dictionary in place.
1734
+ """
1735
+ # --- Input Validation ---
1736
+ if not isinstance(data_input, dict):
1737
+ raise TypeError("Input 'data_input' must be a dictionary.")
1738
+ if 'boxes' not in data_input or not isinstance(data_input.get('boxes'), list):
1739
+ # If there are no boxes, there's nothing to do.
1740
+ return data_input
1741
+
1742
+ boxes_list = data_input['boxes']
1743
+ id_length = 12
1744
+ character_set = string.ascii_letters + string.digits
1745
+
1746
+ # --- 1. Get ALL Existing IDs to Ensure Uniqueness ---
1747
+ # Collect all valid existing IDs from the entire list first.
1748
+ existing_ids = set()
1749
+ for box in boxes_list:
1750
+ if isinstance(box, dict):
1751
+ box_id = box.get('id')
1752
+ if isinstance(box_id, str) and len(box_id) == id_length:
1753
+ existing_ids.add(box_id)
1754
+
1755
+ # --- 2. Iterate and Fill IDs for each box ---
1756
+ generated_ids_this_run = set() # Keep track of IDs generated in this run
1757
+ num_filled = 0
1758
+
1759
+ for box in boxes_list:
1760
+ if not isinstance(box, dict):
1761
+ continue # Skip items in the list that are not dictionaries
1762
+
1763
+ box_id = box.get('id')
1764
+
1765
+ # Check if this specific box needs a new ID
1766
+ needs_new_id = (
1767
+ box_id is None or
1768
+ not isinstance(box_id, str) or
1769
+ box_id.strip() == "" or
1770
+ len(box_id) != id_length
1771
+ )
1772
+
1773
+ if needs_new_id:
1774
+ # Generate a truly unique ID
1775
+ while True:
1776
+ candidate_id = ''.join(random.choices(character_set, k=id_length))
1777
+ # Check against original IDs and newly generated IDs
1778
+ if candidate_id not in existing_ids and candidate_id not in generated_ids_this_run:
1779
+ generated_ids_this_run.add(candidate_id)
1780
+ box['id'] = candidate_id # Assign the ID to the individual box
1781
+ num_filled += 1
1782
+ break # Move to the next box
1783
+
1784
+ if num_filled > 0:
1785
+ print(f"Successfully filled {num_filled} missing or invalid box IDs.")
1786
+
1787
+ # The input dictionary 'data_input' has been modified in place
1788
+ return data_input
1789
+
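Editor's note: a minimal usage sketch for fill_missing_box_ids_each_box, assuming the annotation-group shape used elsewhere in this commit; the image path and box values are illustrative only.

from tools.file_conversion import fill_missing_box_ids_each_box

group = {
    "image": "example_page_0.png",  # illustrative path
    "boxes": [
        {"label": "Duplicate text", "xmin": 0.1, "ymin": 0.2, "xmax": 0.8, "ymax": 0.25, "text": "line one", "id": ""},
        {"label": "Duplicate text", "xmin": 0.1, "ymin": 0.3, "xmax": 0.8, "ymax": 0.35, "text": "line two"},  # 'id' key missing entirely
    ],
}
group = fill_missing_box_ids_each_box(group)
assert all(len(box["id"]) == 12 for box in group["boxes"])  # every box now carries a unique 12-character ID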
1790
  def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
1791
  """
1792
  Optimized: Generates unique alphanumeric IDs for rows in a DataFrame column
tools/file_redaction.py CHANGED
@@ -287,7 +287,7 @@ def choose_and_run_redactor(file_paths:List[str],
287
  # Call prepare_image_or_pdf only if needed
288
  if prepare_images_flag is not None:
289
  out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox = prepare_image_or_pdf(
290
- file_paths_loop, text_extraction_method, 0, out_message, True,
291
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
292
  output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
293
  )
@@ -887,7 +887,6 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
887
 
888
  return img_annotation_box, rect
889
 
890
-
891
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
892
  '''
893
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
@@ -932,23 +931,6 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
932
 
933
  return img_annotation_box, rect
934
 
935
- # def set_cropbox_safely(page, original_cropbox):
936
- # """
937
- # Sets the cropbox of a page, ensuring it's not larger than the mediabox.
938
- # If the original cropbox is larger, the mediabox is used instead.
939
-
940
- # Args:
941
- # page: The PyMuPdf page object.
942
- # original_cropbox: The fitz.Rect representing the desired cropbox.
943
- # """
944
- # mediabox = page.mediabox
945
- # if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
946
- # #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
947
- # page.set_cropbox(mediabox)
948
- # else:
949
- # page.set_cropbox(original_cropbox)
950
-
951
-
952
  def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
953
  """
954
  Sets the cropbox of a PyMuPDF page safely and defensively.
@@ -995,7 +977,6 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
995
  else:
996
  page.set_cropbox(original_cropbox)
997
 
998
-
999
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
1000
 
1001
  rect_height = page.rect.height
@@ -1788,7 +1769,6 @@ def redact_image_pdf(file_path:str,
1788
 
1789
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1790
 
1791
-
1792
  ###
1793
  # PIKEPDF TEXT DETECTION/REDACTION
1794
  ###
 
287
  # Call prepare_image_or_pdf only if needed
288
  if prepare_images_flag is not None:
289
  out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox = prepare_image_or_pdf(
290
+ file_paths_loop, text_extraction_method, all_line_level_ocr_results_df, 0, out_message, True,
291
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
292
  output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
293
  )
 
887
 
888
  return img_annotation_box, rect
889
 
 
890
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
891
  '''
892
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
 
931
 
932
  return img_annotation_box, rect
933
 
934
  def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
935
  """
936
  Sets the cropbox of a PyMuPDF page safely and defensively.
 
977
  else:
978
  page.set_cropbox(original_cropbox)
979
 
 
980
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
981
 
982
  rect_height = page.rect.height
 
1769
 
1770
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1771
 
 
1772
  ###
1773
  # PIKEPDF TEXT DETECTION/REDACTION
1774
  ###
tools/find_duplicate_pages.py CHANGED
@@ -4,13 +4,15 @@ import re
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
- from typing import List, Tuple
 
8
  import gradio as gr
9
  from gradio import Progress
10
  from pathlib import Path
11
  from pymupdf import Document
12
- from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe
13
  import en_core_web_lg
 
14
  nlp = en_core_web_lg.load()
15
 
16
  similarity_threshold = 0.95
@@ -56,10 +58,11 @@ def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, outp
56
  grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
57
  else:
58
  df['line_number_by_page'] = df.groupby('page').cumcount() + 1
 
59
  df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
60
  df['page'] = df['page'].astype(int)
61
 
62
- grouped = df.drop('line_number_by_page', axis=1)
63
 
64
  # Add filename column
65
  grouped['file'] = os.path.basename(file_path)
@@ -405,7 +408,7 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
405
  # Return the updated dataframe, the new file list, and clear the preview panes
406
  return updated_df, new_output_paths, None, None
407
 
408
- def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, duplicates_by_line_or_page_bool:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
409
  """
410
  Wrapper that combines the uploaded OCR output files and runs the duplicate analysis at page or line level, including the 'greedy_match' option.
411
  """
@@ -414,7 +417,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
414
  return None, None, None
415
 
416
  progress(0, desc="Combining input files...")
417
- df_combined, _ = combine_ocr_output_text(files, combine_pages=duplicates_by_line_or_page_bool)
418
 
419
  if df_combined.empty:
420
  gr.Warning("No data found in the uploaded files.")
@@ -427,7 +430,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
427
  min_word_count=min_words,
428
  min_consecutive_pages=int(min_consecutive),
429
  greedy_match=greedy_match,
430
- combine_pages=duplicates_by_line_or_page_bool,
431
  progress=progress
432
  )
433
 
@@ -476,132 +479,438 @@ def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: g
476
 
477
  return page1_data[['page', 'text']], page2_data[['page', 'text']]
478
 
479
- def apply_whole_page_redactions_from_list(duplicate_page_numbers_df:pd.DataFrame, doc_file_name_with_extension_textbox:str, review_file_state:pd.DataFrame, duplicate_output_paths:list[str], pymupdf_doc:object, page_sizes:list[dict], all_existing_annotations:list[dict]):
480
  '''
481
- Take a list of suggested whole pages to redact and apply it to review file data currently available from an existing PDF under review
482
  '''
483
- # Create a copy of annotations to avoid modifying the original
484
  all_annotations = all_existing_annotations.copy()
485
 
486
- if not pymupdf_doc:
487
- print("Warning: No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
488
- raise Warning("No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
489
- return review_file_state, all_annotations
490
 
491
- # Initialize list of pages to redact
492
- list_whole_pages_to_redact = []
493
-
494
- # Get list of pages to redact from either dataframe or file
495
- if not duplicate_page_numbers_df.empty:
496
- list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
497
- elif duplicate_output_paths:
498
- expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
499
- whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
500
-
501
- for output_file in duplicate_output_paths:
502
- # Note: output_file.name might not be available if output_file is just a string path
503
- # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
504
- file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
505
- if expected_duplicate_pages_to_redact_name in file_name_from_path:
506
- whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
507
- break
508
 
509
  if not whole_pages_list.empty:
510
  list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
511
-
512
- # Convert to set to remove duplicates, then back to list
513
- list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
514
-
515
- if not list_whole_pages_to_redact:
516
- # Assuming gr is defined (e.g., gradio)
517
- print("No relevant list of whole pages to redact found, returning inputs.")
518
- raise Warning("Warning: No relevant list of whole pages to redact found, returning inputs.")
519
- return review_file_state, all_existing_annotations
 
520
 
521
  new_annotations = []
522
-
523
  # Process each page for redaction
524
  for page in list_whole_pages_to_redact:
525
  try:
526
- page_index = int(page) - 1
527
- if page_index < 0 or page_index >= len(pymupdf_doc):
528
- print(f"Page {page} is out of bounds for a document with {len(pymupdf_doc)} pages, skipping.")
 
529
  continue
530
-
531
- pymupdf_page = pymupdf_doc[page_index]
532
 
533
- # Find the matching page size dictionary
534
- page_size = next((size for size in page_sizes if size["page"] == int(page)), None)
535
-
536
- if not page_size:
537
- print(f"Page {page} not found in page_sizes object, skipping.")
538
  continue
539
 
540
- rect_height = page_size["cropbox_height"]
541
- rect_width = page_size["cropbox_width"]
542
- image = page_size["image_path"] # This `image` likely represents the page identifier
543
-
544
- # Create the whole page redaction box
545
- annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, pymupdf_page, border=0.005, redact_pdf=False)
546
 
547
- # Find existing annotation for this image/page
548
- current_page_existing_boxes_group = next((annot_group for annot_group in all_annotations if annot_group["image"] == image), None)
549
 
550
- new_annotation_group = {
551
- "image": image,
552
- "boxes": [annotation_box]
553
- }
554
 
555
- if current_page_existing_boxes_group:
556
- # Check if we already have a whole page redaction for this page
557
- if not any(box["label"] == "Whole page" for box in current_page_existing_boxes_group["boxes"]):
558
- current_page_existing_boxes_group["boxes"].append(annotation_box)
559
 
560
- else:
561
- # Optional: Print a message if a whole-page redaction already exists for this page
562
- print(f"Whole page redaction for page {page} already exists in annotations, skipping addition.")
563
- pass
564
- else: # Create new annotation entry
565
-
566
- all_annotations.append(new_annotation_group)
567
-
568
- new_annotations.append(new_annotation_group)
569
-
570
  except Exception as e:
571
  print(f"Error processing page {page}: {str(e)}")
572
  continue
573
 
574
- # Convert annotations to dataframe and combine with existing review file
575
  whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
576
-
577
- # Ensure all required columns are present in both DataFrames before concat
578
- # This is a common point of error if DFs have different schemas
 
  expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
580
-
581
  for col in expected_cols:
582
- if col not in review_file_state.columns:
583
- review_file_state[col] = None # Or an appropriate default value
584
- if col not in whole_page_review_file.columns:
585
- whole_page_review_file[col] = None
586
 
587
  review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
588
- review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"])
 
 
590
- # --- Remove duplicate entries from the final DataFrame ---
591
- dedup_subset_cols = ['page', 'label', 'text', 'id']
 
 
593
- # Ensure these columns exist before trying to use them as subset for drop_duplicates
594
- if all(col in review_file_out.columns for col in dedup_subset_cols):
595
- review_file_out = review_file_out.drop_duplicates(
596
- subset=dedup_subset_cols,
597
- keep='first' # Keep the first occurrence of a duplicate redaction
598
- )
599
- else:
600
- print(f"Warning: Not all columns required for de-duplication ({dedup_subset_cols}) are present in review_file_out. Skipping specific de-duplication.")
601
- # You might want a fallback or to inspect what's missing
 
602
 
603
- review_file_out.to_csv(OUTPUT_FOLDER + "review_file_out_after_whole_page.csv")
604
 
605
- gr.Info("Successfully created whole page redactions. Go to the 'Review redactions' tab to see them.")
606
 
607
- return review_file_out, all_annotations
 
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
+ from typing import List, Tuple, Optional, Dict
8
+ from collections import defaultdict
9
  import gradio as gr
10
  from gradio import Progress
11
  from pathlib import Path
12
  from pymupdf import Document
13
+ from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
14
  import en_core_web_lg
15
+
16
  nlp = en_core_web_lg.load()
17
 
18
  similarity_threshold = 0.95
 
58
  grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
59
  else:
60
  df['line_number_by_page'] = df.groupby('page').cumcount() + 1
61
+ df['original_page'] = df['page']
62
  df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
63
  df['page'] = df['page'].astype(int)
64
 
65
+ grouped = df  # keep 'line_number_by_page' so line-level duplicates can be mapped back to pages
66
 
67
  # Add filename column
68
  grouped['file'] = os.path.basename(file_path)
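Editor's note: a small sketch of the composite page/line ID built above when combine_pages is False. The page and the per-page line number are each zero-padded to five digits and concatenated, so the pair can later be recovered by _parse_page_line_id; the values below are illustrative.

page, line_number_by_page = 2, 5
composite_id = int(str(page).zfill(5) + str(line_number_by_page).zfill(5))
print(composite_id)  # 200005 (leading zeros disappear once the string is cast to int)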
 
408
  # Return the updated dataframe, the new file list, and clear the preview panes
409
  return updated_df, new_output_paths, None, None
410
 
411
+ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
412
  """
413
  Wrapper that combines the uploaded OCR output files and runs the duplicate analysis at page or line level, including the 'greedy_match' option.
414
  """
 
417
  return None, None, None
418
 
419
  progress(0, desc="Combining input files...")
420
+ df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)
421
 
422
  if df_combined.empty:
423
  gr.Warning("No data found in the uploaded files.")
 
430
  min_word_count=min_words,
431
  min_consecutive_pages=int(min_consecutive),
432
  greedy_match=greedy_match,
433
+ combine_pages=combine_pages,
434
  progress=progress
435
  )
436
 
 
479
 
480
  return page1_data[['page', 'text']], page2_data[['page', 'text']]
481
 
482
+ def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
483
+ """
484
+ Finds and returns the size and path information for a specific page.
485
+ """
486
+ return next((size for size in page_sizes if size["page"] == page_num), None)
487
+
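Editor's note: an illustrative lookup with the helper above; the page_sizes entry mirrors the fields this file reads later (image_path, cropbox_width, cropbox_height).

from tools.find_duplicate_pages import get_page_image_info

page_sizes = [{"page": 1, "image_path": "example_page_0.png", "cropbox_width": 612, "cropbox_height": 792}]
print(get_page_image_info(1, page_sizes))   # the dict above
print(get_page_image_info(99, page_sizes))  # None, since that page is absent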
488
+ def add_new_annotations_to_existing_page_annotations(
489
+ all_annotations: List[Dict],
490
+ image_path: str,
491
+ new_annotation_boxes: List[Dict]
492
+ ) -> Tuple[List[Dict], Dict]:
493
+ """
494
+ Adds a list of new annotation boxes to the annotations for a specific page.
495
+
496
+ If the page already has annotations, it extends the list of boxes. If not,
497
+ it creates a new entry for the page.
498
+
499
+ Args:
500
+ all_annotations (List[Dict]): The current list of all annotation groups.
501
+ image_path (str): The identifier for the image/page.
502
+ new_annotation_boxes (List[Dict]): A list of new annotation boxes to add.
503
+
504
+ Returns:
505
+ Tuple[List[Dict], Dict]: A tuple containing:
506
+ - The updated list of all annotation groups.
507
+ - The annotation group representing the newly added boxes.
508
+ """
509
+ # Find the annotation group for the current page/image
510
+ current_page_group = next(
511
+ (annot_group for annot_group in all_annotations if annot_group["image"] == image_path),
512
+ None
513
+ )
514
+
515
+ if current_page_group:
516
+ # Page already has annotations, so extend the list with the new boxes
517
+ current_page_group["boxes"].extend(new_annotation_boxes)
518
+ else:
519
+ # This is the first set of annotations for this page, create a new group
520
+ new_group = {
521
+ "image": image_path,
522
+ "boxes": new_annotation_boxes
523
+ }
524
+ all_annotations.append(new_group)
525
+
526
+ # This object represents all annotations that were just added for this page
527
+ newly_added_annotation_group = {
528
+ "image": image_path,
529
+ "boxes": new_annotation_boxes
530
+ }
531
+
532
+ return all_annotations, newly_added_annotation_group
533
+
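Editor's note: a usage sketch for the helper above; the path and box values are illustrative.

from tools.find_duplicate_pages import add_new_annotations_to_existing_page_annotations

all_annotations = [{"image": "example_page_0.png", "boxes": []}]
new_boxes = [{"label": "Duplicate text", "xmin": 0.1, "ymin": 0.1, "xmax": 0.9, "ymax": 0.15, "text": "x", "id": ""}]
all_annotations, added_group = add_new_annotations_to_existing_page_annotations(
    all_annotations=all_annotations,
    image_path="example_page_0.png",
    new_annotation_boxes=new_boxes,
)
# The existing group for "example_page_0.png" is extended in place;
# added_group holds only the boxes passed in this call.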
534
+ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=[]):
535
  '''
536
+ Take a list of suggested whole pages to redact and apply it to review file data.
537
  '''
 
538
  all_annotations = all_existing_annotations.copy()
539
 
540
+ if not pymupdf_doc:
541
+ message = "No document file currently under review."
542
+ print(f"Warning: {message}")
543
+ raise Warning(message)
544
 
545
+ list_whole_pages_to_redact = []
546
+
547
+ if combine_pages:
548
+ # Get list of pages to redact from either dataframe or file
549
+ if not duplicate_page_numbers_df.empty:
550
+ list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
551
+ elif duplicate_output_paths:
552
+ expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
553
+ whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
554
+
555
+ for output_file in duplicate_output_paths:
556
+ # Note: output_file.name might not be available if output_file is just a string path
557
+ # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
558
+ file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
559
+ if expected_duplicate_pages_to_redact_name in file_name_from_path:
560
+ whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
561
+ break
562
+ else:
563
+ message = "No relevant list of whole pages to redact found."
564
+ print(message)
565
+ raise Warning(message)
566
 
567
  if not whole_pages_list.empty:
568
  list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
569
+
570
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
571
+
572
+ else:
573
+ if not new_annotations_with_bounding_boxes:
574
+ message = "Can't find any new annotations to add"
575
+ print(message)
576
+ raise Warning(message)
577
+
578
+ list_whole_pages_to_redact = []
579
+ for annotation in new_annotations_with_bounding_boxes:
580
+ match = re.search(r'_(\d+)\.png$', annotation["image"])
581
+ if match:
582
+ page = int(match.group(1)) + 1
583
+ list_whole_pages_to_redact.append(page)
584
+ else:
585
+ print(f"Warning: Could not extract page number from {annotation['image']}")
586
+
587
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
588
+
589
 
590
  new_annotations = []
 
591
  # Process each page for redaction
592
  for page in list_whole_pages_to_redact:
593
  try:
594
+ page_num = int(page)
595
+ page_index = page_num - 1
596
+ if not (0 <= page_index < len(pymupdf_doc)):
597
+ print(f"Page {page_num} is out of bounds, skipping.")
598
  continue
 
 
599
 
600
+ page_info = get_page_image_info(page_num, page_sizes)
601
+ if not page_info:
602
+ print(f"Page {page_num} not found in page_sizes, skipping.")
 
 
603
  continue
604
 
605
+ image_path = page_info["image_path"]
606
+ page_annotation_group = next((g for g in all_annotations if g["image"] == image_path), None)
607
+ if page_annotation_group and any(box["label"] == "Whole page" for box in page_annotation_group["boxes"]):
608
+ print(f"Whole page redaction for page {page_num} already exists, skipping.")
609
+ continue
610
+
611
+ # --- Create a LIST of boxes to add ---
612
+ boxes_to_add = []
613
 
614
+ pymupdf_page = pymupdf_doc[page_index]
 
615
 
616
+ if combine_pages:
617
+ whole_page_box = redact_whole_pymupdf_page(
618
+ rect_height=page_info["cropbox_height"],
619
+ rect_width=page_info["cropbox_width"],
620
+ page=pymupdf_page, border=0.005, redact_pdf=False
621
+ )
622
+ boxes_to_add.append(whole_page_box)
623
+ else:
624
+ # Find the specific annotation group that matches the current page's image path
625
+ relevant_box_group = next(
626
+ (group for group in new_annotations_with_bounding_boxes if group.get('image') == image_path),
627
+ None # Default to None if no match is found
628
+ )
629
+
630
+ # Check if we found a matching group of boxes for this page
631
+ if relevant_box_group:
632
+ boxes_to_add.extend(relevant_box_group['boxes'])
633
+ else:
634
+ # This case would be unexpected, but it's good to handle.
635
+ # It means a page was in list_whole_pages_to_redact but had no
636
+ # corresponding boxes generated in new_annotations_with_bounding_boxes.
637
+ print(f"Warning: No new annotation boxes found for page {page_num} ({image_path}).")
638
+
639
+ # === Use the modified helper function to add a LIST of boxes ===
640
+ all_annotations, new_annotations_for_page = add_new_annotations_to_existing_page_annotations(
641
+ all_annotations=all_annotations,
642
+ image_path=image_path,
643
+ new_annotation_boxes=boxes_to_add # Pass the list here
644
+ )
645
 
646
+ new_annotations_for_page = fill_missing_box_ids_each_box(new_annotations_for_page)
647
+ new_annotations.append(new_annotations_for_page)
 
 
648
 
649
  except Exception as e:
650
  print(f"Error processing page {page}: {str(e)}")
651
  continue
652
 
 
653
  whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
654
+
655
+ if whole_page_review_file.empty:
656
+ message = "No new whole page redactions were added."
657
+ print(message)
658
+ gr.Info(message)
659
+ return review_file_state, all_annotations
660
+
661
  expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
 
662
  for col in expected_cols:
663
+ if col not in review_file_state.columns: review_file_state[col] = pd.NA
664
+ if col not in whole_page_review_file.columns: whole_page_review_file[col] = pd.NA
 
 
665
 
666
  review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
667
+ review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
668
+ review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
669
+
670
+ out_message = "Successfully created whole page redactions."
671
+ print(out_message)
672
+ gr.Info(out_message)
673
+
674
+ return review_file_out, all_annotations
675
 
676
+
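Editor's note: a sketch of the filename convention the regex in the function above relies on. Image paths are assumed to end in a zero-based page index, so "_0.png" maps to page 1; the path below is illustrative.

import re

match = re.search(r'_(\d+)\.png$', "output/example_doc_3.png")
if match:
    page = int(match.group(1)) + 1
    print(page)  # 4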
677
+ # --- 1. Helper Function to Parse the Combined Page/Line ID ---
678
+ def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
679
+ """
680
+ Parses a combined page and line number ID into a (page, line) tuple.
681
+ Assumes the ID, zero-padded to 10 digits, encodes the page in the
682
+ first five digits and the line number in the last five.
683
 
684
+ Example: 100027 -> (1, 27)
685
+ 200005 -> (2, 5)
686
+ """
687
+ # zfill ensures the string is padded with leading zeros to 10 characters
688
+ s_id = str(combined_id).zfill(10)
689
+ page = int(s_id[:5])
690
+ line = int(s_id[5:])
691
+ return page, line
692
+
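Editor's note: a worked example for the parser above; the combined IDs match those produced by combine_ocr_output_text when combine_pages is False.

from tools.find_duplicate_pages import _parse_page_line_id

print(_parse_page_line_id(200005))  # (2, 5): "0000200005" splits into page 2, line 5
print(_parse_page_line_id(100027))  # (1, 27): "0000100027" splits into page 1, line 27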
693
+ # def create_annotations_from_ocr_outputs(ocr_results_df_lines_to_annotate:pd.DataFrame):
694
+ # '''
695
+ # Create a set of annotation boxes based on selected ocr_results_df lines.
696
+ # '''
697
+ # annotations_by_page = []
698
+
699
+ # # --- Build Annotation Boxes for each selected line ---
700
+ # for _, line_row in ocr_results_df_lines_to_annotate.iterrows():
701
+ # # The coordinates are relative, so xmax = left + width and ymax = top + height
702
+ # box = {
703
+ # "label": "Similar Text", # Or any other label you prefer
704
+ # "xmin": line_row['left'],
705
+ # "ymin": line_row['top'] + line_row['height'],
706
+ # "xmax": line_row['left'] + line_row['width'],
707
+ # "ymax": line_row['top'] ,
708
+ # "text": line_row['text']
709
+ # }
710
+ # # --- 6. Group the box by its page number ---
711
+ # page_number = line_row['page']
712
+ # annotations_by_page[page_number].append(box)
713
+
714
+ # return annotations_by_page
715
+
716
+ # def create_annotation_objects_from_duplicates(
717
+ # duplicates_df: pd.DataFrame,
718
+ # ocr_results_df: pd.DataFrame,
719
+ # combine_pages:bool=False
720
+ # ) -> List[Dict]:
721
+ # """
722
+ # Creates structured annotation objects from selected ocr outputs.
723
+
724
+ # Args:
725
+ # duplicates_df (pd.DataFrame): DataFrame containing duplicate ranges with
726
+ # columns like 'Page2_Start_Page' and 'Page2_End_Page'.
727
+ # ocr_results_df (pd.DataFrame): DataFrame with OCR results, including columns
728
+ # 'page', 'text', 'left', 'top', 'width', 'height'.
729
+
730
+ # Returns:
731
+ # List[Dict]: A list of dictionaries, where each dict represents a page and its
732
+ # list of annotation boxes, in the format:
733
+ # [{"page": 1, "boxes": [...]}, {"page": 2, "boxes": [...]}]
734
+ # """
735
+ # annotations_by_page = []
736
+
737
+ # if combine_pages == False:
738
+
739
+ # # --- 2. Prepare OCR Data: Add a line number column if it doesn't exist ---
740
+ # if 'line_number_by_page' not in ocr_results_df.columns:
741
+ # print("Generating 'line_number_by_page' for ocr_results_df...")
742
+ # # Sort by page and original position to ensure correct line numbering
743
+ # ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
744
+ # ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
745
+
746
+ # # Use defaultdict to easily append to lists for each page
747
+ # annotations_by_page = defaultdict(list)
748
+
749
+ # # --- 3. Iterate through each duplicate range ---
750
+ # for _, row in duplicates_df.iterrows():
751
+ # # Parse the start and end page/line numbers from the duplicate row
752
+ # start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
753
+ # end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
754
+
755
+ # # --- 4. Select OCR Lines based on the range ---
756
+ # # This logic correctly handles ranges within a single page and across multiple pages
757
+ # if start_page == end_page:
758
+ # # Simple case: the range is on a single page
759
+ # condition = (
760
+ # (ocr_results_df['page'] == start_page) &
761
+ # (ocr_results_df['line_number_by_page'].between(start_line, end_line))
762
+ # )
763
+ # else:
764
+ # # Complex case: the range spans multiple pages
765
+ # # Condition for the first page in the range
766
+ # cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
767
+ # # Condition for all pages between the start and end
768
+ # cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
769
+ # # Condition for the last page in the range
770
+ # cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
771
+
772
+ # condition = cond_start | cond_middle | cond_end
773
 
774
+ # lines_to_annotate = ocr_results_df[condition]
775
+
776
+ # annotations_by_page = create_annotations_from_ocr_outputs(lines_to_annotate)
777
+
778
+ # # --- Format the final output list ---
779
+ # final_output = []
780
+ # # Sort by page number for a predictable order
781
+ # for page, boxes in sorted(annotations_by_page.items()):
782
+ # final_output.append({
783
+ # "page": page,
784
+ # "boxes": boxes
785
+ # })
786
+
787
+ # return final_output
788
+
789
+ def create_annotation_objects_from_duplicates(
790
+ duplicates_df: pd.DataFrame,
791
+ ocr_results_df: pd.DataFrame,
792
+ page_sizes: List[Dict],
793
+ combine_pages:bool=False
794
+ ) -> List[Dict]:
795
+ """
796
+ Creates structured annotation objects from duplicate line ranges, mapping
797
+ page numbers to image paths.
798
 
799
+ Args:
800
+ duplicates_df (pd.DataFrame): DataFrame with duplicate ranges.
801
+ ocr_results_df (pd.DataFrame): DataFrame with OCR results.
802
+ page_sizes (List[Dict]): A list of dictionaries mapping page numbers to image paths and other metadata. Expected format: [{"page": 1, "image_path": "path/to/img.png", ...}]
803
+ combine_pages (bool): A boolean that determines whether in previous functions, all text from a page was combined (True). This function will only run if this is False.
804
+
805
+ Returns:
806
+ List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
807
+ """
808
+ final_output = []
809
+
810
+ if not combine_pages:
811
+ # --- NEW: Create an efficient lookup map from page number to image path ---
812
+ page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
813
+
814
+ # Prepare OCR Data: Add a line number column if it doesn't exist
815
+ if 'line_number_by_page' not in ocr_results_df.columns:
816
+ ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
817
+ ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
818
+
819
+ annotations_by_page = defaultdict(list)
820
+
821
+ # Iterate through each duplicate range (this logic is unchanged)
822
+ for _, row in duplicates_df.iterrows():
823
+ start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
824
+ end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
825
+
826
+ # Select OCR Lines based on the range (this logic is unchanged)
827
+ if start_page == end_page:
828
+ condition = (
829
+ (ocr_results_df['page'] == start_page) &
830
+ (ocr_results_df['line_number_by_page'].between(start_line, end_line))
831
+ )
832
+ else:
833
+ cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
834
+ cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
835
+ cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
836
+ condition = cond_start | cond_middle | cond_end
837
+
838
+ lines_to_annotate = ocr_results_df[condition]
839
+
840
+ # Build and group annotation boxes by page number (this logic is unchanged)
841
+ for _, line_row in lines_to_annotate.iterrows():
842
+ box = {
843
+ "label": "Duplicate text",
844
+ "color": (0,0,0),
845
+ "xmin": line_row['left'],
846
+ "ymin": line_row['top'],
847
+ "xmax": line_row['left'] + line_row['width'],
848
+ "ymax": line_row['top'] + line_row['height'],
849
+ "text": line_row['text'],
850
+ "id": "" # to be filled in after
851
+ }
852
+ page_number = line_row['page']
853
 
854
+
855
+ annotations_by_page[page_number].append(box)
856
+
857
+ print("annotations_by_page:", annotations_by_page)
858
+
859
+ # --- Format the final output list using the page-to-image map ---
860
+ final_output = []
861
+ # Sort by page number for a predictable order
862
+ for page_num, boxes in sorted(annotations_by_page.items()):
863
+ # Look up the image path using the page number
864
+ image_path = page_to_image_map.get(page_num)
865
+
866
+ if image_path:
867
+ page_boxes = {
868
+ "image": image_path,
869
+ "boxes": boxes
870
+ }
871
+
872
+ # Fill in missing IDs for the new data entries
873
+ page_boxes = fill_missing_box_ids_each_box(page_boxes)
874
+
875
+ # Add the annotation group using 'image' as the key
876
+ final_output.append(page_boxes)
877
+ else:
878
+ # Handle cases where a page might not have a corresponding image path
879
+ print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
880
+ f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
881
+
882
+ print("final_output:", final_output)
883
+
884
+ return final_output
885
+
886
+ # --- Example Usage ---
887
+
888
+ # 1. Create your example DataFrames
889
+ # duplicates_data = {
890
+ # 'Page1_File': ['doc_a.csv'],
891
+ # 'Page1_Start_Page': [100009],
892
+ # 'Page1_End_Page': [100021],
893
+ # 'Page2_File': ['doc_a.csv'],
894
+ # 'Page2_Start_Page': [100027], # Page 1, Line 27
895
+ # 'Page2_End_Page': [200005], # Page 2, Line 5
896
+ # }
897
+ # duplicates_df = pd.DataFrame(duplicates_data)
898
+
899
+ # ocr_data = {
900
+ # 'page': [1]*30 + [2]*10, # 30 lines on page 1, 10 on page 2
901
+ # 'text': [f"Text on page {p}, line {l}" for p in [1, 2] for l in range(1, (31 if p==1 else 11))],
902
+ # # Example coordinates (using small, consistent values for demonstration)
903
+ # 'left': [0.1] * 40,
904
+ # 'top': [i*0.02 for i in range(30)] + [i*0.02 for i in range(10)],
905
+ # 'width': [0.8] * 40,
906
+ # 'height': [0.015] * 40,
907
+ # }
908
+ # ocr_results_df = pd.DataFrame(ocr_data)
909
+
910
+
911
+ # # 2. Run the function
912
+ # generated_annotations = create_annotation_objects_from_duplicates(duplicates_df, ocr_results_df, page_sizes, combine_pages=False)  # page_sizes maps pages to image paths
913
+
914
+ # # 3. Print the result
915
+ # import json
916
+ # print(json.dumps(generated_annotations, indent=2))
tools/helper_functions.py CHANGED
@@ -89,6 +89,16 @@ def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str,
89
  ]
90
  return cost_code_df
91
 
92
  def update_dataframe(df:pd.DataFrame):
93
  df_copy = df.copy()
94
  return df_copy
 
89
  ]
90
  return cost_code_df
91
 
92
+ def ensure_folder_exists(output_folder:str):
93
+ """Checks if the specified folder exists, creates it if not."""
94
+
95
+ if not os.path.exists(output_folder):
96
+ # Create the folder if it doesn't exist
97
+ os.makedirs(output_folder, exist_ok=True)
98
+ print(f"Created the {output_folder} folder.")
99
+ else:
100
+ print(f"The {output_folder} folder already exists.")
101
+
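Editor's note: a one-line usage sketch; "output/" is an illustrative path.

from tools.helper_functions import ensure_folder_exists

ensure_folder_exists("output/")  # creates the folder on first call, reports it as existing afterwards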
102
  def update_dataframe(df:pd.DataFrame):
103
  df_copy = df.copy()
104
  return df_copy
tools/redaction_review.py CHANGED
@@ -1429,7 +1429,6 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List
1429
 
1430
  return output_paths
1431
 
1432
-
1433
  ### Convert xfdf coordinates back to image for app
1434
 
1435
  def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
 
1429
 
1430
  return output_paths
1431
 
 
1432
  ### Convert xfdf coordinates back to image for app
1433
 
1434
  def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):