seanpedrickcase committed
Commit c8ffcd4 · Parent: e424038

Further updates to line-level duplicate identification

Renames the page/line analysis toggle to combine_page_text_for_duplicates_bool, wires the new create_annotation_objects_from_duplicates step into the apply-match flow, threads all_line_level_ocr_results_df through prepare_image_or_pdf, moves ensure_folder_exists out of tools/config.py and into tools/helper_functions.py (output folders are now created at app startup), makes the local and S3 log folder locations independently configurable, and adds fill_missing_box_ids_each_box for assigning unique IDs to individual annotation boxes.

app.py CHANGED
@@ -2,8 +2,8 @@ import os
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME
6
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
@@ -12,12 +12,22 @@ from tools.data_anonymise import anonymise_data_files
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
- from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
  # Convert string environment variables to string or list
22
  if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
23
  else: SAVE_LOGS_TO_CSV = False
@@ -89,15 +99,15 @@ with app:
89
  # Backup versions of these objects in case you make a mistake
90
  backup_review_state = gr.Dataframe(visible=False)
91
  backup_image_annotations_state = gr.State([])
92
- backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
93
 
94
  # Logging variables
95
  access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
96
- access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
97
  feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
98
- feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
99
  usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
100
- usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
101
 
102
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
103
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
@@ -164,7 +174,7 @@ with app:
164
 
165
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
166
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
167
- local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
168
 
169
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
170
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
@@ -173,9 +183,8 @@ with app:
173
 
174
  # Base tables that are not modified subsequent to load
175
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
176
- all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
177
- all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
178
- cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
179
 
180
  # Placeholder for selected entity dataframe row
181
  selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)
@@ -197,6 +206,7 @@ with app:
197
  page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
198
 
199
  # Placeholders for elements that may be made visible later below depending on environment variables
200
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
201
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
202
 
@@ -226,6 +236,9 @@ with app:
226
  textract_job_output_file = gr.File(label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False)
227
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
228
 
229
  ###
230
  # UI DESIGN
231
  ###
@@ -408,7 +421,7 @@ with app:
408
  with gr.Row():
409
  duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
410
  min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
411
- duplicates_by_line_or_page_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
412
 
413
  gr.Markdown("#### Matching Strategy")
414
  greedy_match_input = gr.Checkbox(
@@ -588,7 +601,7 @@ with app:
588
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
589
 
590
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
591
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
592
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
593
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
594
 
@@ -627,7 +640,7 @@ with app:
627
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
628
 
629
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
630
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
631
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
632
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
633
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
@@ -644,7 +657,7 @@ with app:
644
  # Upload previous files for modifying redactions
645
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
646
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
647
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
648
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
649
 
650
  # Manual updates to review di
@@ -725,6 +738,7 @@ with app:
725
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
726
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
727
 
728
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
729
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
730
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
@@ -734,16 +748,17 @@ with app:
734
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
735
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
736
 
737
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
738
 
739
  # Convert review file to xfdf Adobe format
740
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
741
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
742
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
743
 
744
  # Convert xfdf Adobe file back to review_file.csv
745
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
746
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
747
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
748
 
749
  ###
@@ -764,6 +779,8 @@ with app:
764
  ###
765
  # IDENTIFY DUPLICATE PAGES
766
  ###
767
  find_duplicate_pages_btn.click(
768
  fn=run_duplicate_analysis,
769
  inputs=[
@@ -772,7 +789,7 @@ with app:
772
  min_word_count_input,
773
  min_consecutive_pages_input,
774
  greedy_match_input,
775
- duplicates_by_line_or_page_bool
776
  ],
777
  outputs=[
778
  results_df_preview,
@@ -795,9 +812,9 @@ with app:
795
  outputs=[results_df_preview, duplicate_files_out, page1_text_preview, page2_text_preview]
796
  )
797
 
798
- apply_match_btn.click(
799
- fn=apply_whole_page_redactions_from_list,
800
- inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state],
801
  outputs=[review_file_df, all_image_annotations_state]).\
802
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
803
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME, CONFIG_FOLDER, GRADIO_TEMP_DIR, MPLCONFIGDIR, S3_FEEDBACK_LOGS_FOLDER, S3_ACCESS_LOGS_FOLDER, S3_USAGE_LOGS_FOLDER
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list, ensure_folder_exists
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 
12
  from tools.auth import authenticate_user
13
  from tools.load_spacy_model_custom_recognisers import custom_entities
14
  from tools.custom_csvlogger import CSVLogger_custom
15
+ from tools.find_duplicate_pages import run_duplicate_analysis, exclude_match, handle_selection_and_preview, apply_whole_page_redactions_from_list, create_annotation_objects_from_duplicates
16
  from tools.textract_batch_call import analyse_document_with_textract_api, poll_whole_document_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist, replace_existing_pdf_input_for_whole_document_outputs
17
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
+ # Ensure that output folders exist
22
+ ensure_folder_exists(CONFIG_FOLDER)
23
+ ensure_folder_exists(OUTPUT_FOLDER)
24
+ ensure_folder_exists(INPUT_FOLDER)
25
+ ensure_folder_exists(GRADIO_TEMP_DIR)
26
+ ensure_folder_exists(MPLCONFIGDIR)
27
+ ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
28
+ ensure_folder_exists(ACCESS_LOGS_FOLDER)
29
+ ensure_folder_exists(USAGE_LOGS_FOLDER)
30
+
31
  # Convert string environment variables to string or list
32
  if SAVE_LOGS_TO_CSV == "True": SAVE_LOGS_TO_CSV = True
33
  else: SAVE_LOGS_TO_CSV = False
 
99
  # Backup versions of these objects in case you make a mistake
100
  backup_review_state = gr.Dataframe(visible=False)
101
  backup_image_annotations_state = gr.State([])
102
+ backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
103
 
104
  # Logging variables
105
  access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
106
+ access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=S3_ACCESS_LOGS_FOLDER, visible=False)
107
  feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
108
+ feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=S3_FEEDBACK_LOGS_FOLDER, visible=False)
109
  usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
110
+ usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=S3_USAGE_LOGS_FOLDER, visible=False)
111
 
112
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
113
  textract_metadata_textbox = gr.Textbox(label = "textract_metadata_textbox", value="", visible=False)
 
174
 
175
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
176
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
177
+ local_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
178
 
179
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
180
  default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
 
183
 
184
  # Base tables that are not modified subsequent to load
185
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
186
+ all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page","text", "left","top","width","height"], row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
187
+ all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
188
 
189
  # Placeholder for selected entity dataframe row
190
  selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)
 
206
  page_break_return = gr.Checkbox(value = False, label="Page break reached", visible=False)
207
 
208
  # Placeholders for elements that may be made visible later below depending on environment variables
209
+ cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
210
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
211
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
212
 
 
236
  textract_job_output_file = gr.File(label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False)
237
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
238
 
239
+ ## Duplicate search object
240
+ new_duplicate_search_annotation_object = gr.Dropdown(value=None, label="new_duplicate_search_annotation_object", allow_custom_value=True, visible=False)
241
+
242
  ###
243
  # UI DESIGN
244
  ###
 
421
  with gr.Row():
422
  duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
423
  min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
424
+ combine_page_text_for_duplicates_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
425
 
426
  gr.Markdown("#### Matching Strategy")
427
  greedy_match_input = gr.Checkbox(
 
601
  cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
602
 
603
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
604
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
605
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
606
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox])
607
 
 
640
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
641
 
642
  convert_textract_outputs_to_ocr_results.click(replace_existing_pdf_input_for_whole_document_outputs, inputs = [s3_whole_document_textract_input_subfolder, doc_file_name_no_extension_textbox, output_folder_textbox, s3_whole_document_textract_default_bucket, in_doc_files, input_folder_textbox], outputs = [in_doc_files, doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
643
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox]).\
644
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
645
  success(fn=check_for_existing_local_ocr_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[local_ocr_output_found_checkbox]).\
646
  success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
 
657
  # Upload previous files for modifying redactions
658
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
659
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
660
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox], api_name="prepare_doc").\
661
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
662
 
663
  # Manual updates to review di
 
738
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df]).\
739
  success(update_all_entity_df_dropdowns, inputs=[recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown])
740
 
741
+ # Undo last redaction exclusion action
742
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_df, all_image_annotations_state, recogniser_entity_dataframe_base]).\
743
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
744
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
 
748
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_ocr_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
749
  success(increase_bottom_page_count_based_on_top, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom])
750
 
751
+ # Reset the OCR results filter
752
  reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
753
 
754
  # Convert review file to xfdf Adobe format
755
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
756
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
757
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
758
 
759
  # Convert xfdf Adobe file back to review_file.csv
760
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
761
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder, local_ocr_output_found_checkbox]).\
762
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
763
 
764
  ###
 
779
  ###
780
  # IDENTIFY DUPLICATE PAGES
781
  ###
782
+ #in_duplicate_pages.upload(fn = prepare_image_or_pdf, inputs=[in_duplicate_pages, text_extract_method_radio, all_line_level_ocr_results_df_base, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_df, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base, local_ocr_output_found_checkbox])
783
+
784
  find_duplicate_pages_btn.click(
785
  fn=run_duplicate_analysis,
786
  inputs=[
 
789
  min_word_count_input,
790
  min_consecutive_pages_input,
791
  greedy_match_input,
792
+ combine_page_text_for_duplicates_bool
793
  ],
794
  outputs=[
795
  results_df_preview,
 
812
  outputs=[results_df_preview, duplicate_files_out, page1_text_preview, page2_text_preview]
813
  )
814
 
815
+ apply_match_btn.click(fn=create_annotation_objects_from_duplicates, inputs=[results_df_preview, all_line_level_ocr_results_df_base, page_sizes, combine_page_text_for_duplicates_bool], outputs=[new_duplicate_search_annotation_object]).\
816
+ success(fn=apply_whole_page_redactions_from_list,
817
+ inputs=[in_fully_redacted_list_state, doc_file_name_with_extension_textbox, review_file_df, duplicate_files_out, pdf_doc_state, page_sizes, all_image_annotations_state, combine_page_text_for_duplicates_bool, new_duplicate_search_annotation_object],
818
  outputs=[review_file_df, all_image_annotations_state]).\
819
  success(update_annotator_page_from_review_df, inputs=[review_file_df, images_pdf_state, page_sizes, all_image_annotations_state, annotator, selected_entity_dataframe_row, input_folder_textbox, doc_full_file_name_textbox], outputs=[annotator, all_image_annotations_state, annotate_current_page, page_sizes, review_file_df, annotate_previous_page]).\
820
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
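
A note on the apply-match flow wired above: create_annotation_objects_from_duplicates now runs first, building annotation objects from the duplicate-analysis preview and the line-level OCR results, and apply_whole_page_redactions_from_list then merges those into the review file and the image annotations. Below is a minimal plain-Python sketch of the same chain; the argument order mirrors the Gradio wiring above, the local variable names are invented for illustration, and the exact signatures live in tools/find_duplicate_pages.py, which this commit view does not show.

from tools.find_duplicate_pages import (
    create_annotation_objects_from_duplicates,
    apply_whole_page_redactions_from_list,
)

# Values normally supplied by the Gradio components at click time (names here are illustrative).
new_annotations = create_annotation_objects_from_duplicates(
    duplicate_results_df,   # results_df_preview: matches found by run_duplicate_analysis
    line_level_ocr_df,      # all_line_level_ocr_results_df_base: page, text, left, top, width, height
    page_sizes,             # per-page dimensions, used to place the redaction boxes
    combine_page_text,      # combine_page_text_for_duplicates_bool: True = by page, False = by line
)

review_file_df, all_image_annotations = apply_whole_page_redactions_from_list(
    fully_redacted_pages_list, file_name_with_extension, review_file_df,
    duplicate_output_files, pdf_doc, page_sizes, all_image_annotations,
    combine_page_text, new_annotations,
)
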
tools/config.py CHANGED
@@ -28,16 +28,6 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
28
 
29
  return value
30
 
31
- def ensure_folder_exists(output_folder:str):
32
- """Checks if the specified folder exists, creates it if not."""
33
-
34
- if not os.path.exists(output_folder):
35
- # Create the folder if it doesn't exist
36
- os.makedirs(output_folder, exist_ok=True)
37
- print(f"Created the {output_folder} folder.")
38
- else:
39
- print(f"The {output_folder} folder already exists.")
40
-
41
  def add_folder_to_path(folder_path: str):
42
  '''
43
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
@@ -59,15 +49,12 @@ def add_folder_to_path(folder_path: str):
59
  else:
60
  print(f"Folder not found at {folder_path} - not added to PATH")
61
 
62
-
63
  ###
64
  # LOAD CONFIG FROM ENV FILE
65
  ###
66
 
67
  CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
68
 
69
- ensure_folder_exists(CONFIG_FOLDER)
70
-
71
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
72
  APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
73
 
@@ -115,9 +102,6 @@ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
115
  # Retrieving or setting CUSTOM_HEADER_VALUE
116
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
117
 
118
-
119
-
120
-
121
  ###
122
  # Image options
123
  ###
@@ -134,9 +118,6 @@ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
134
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
135
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
136
 
137
- ensure_folder_exists(OUTPUT_FOLDER)
138
- ensure_folder_exists(INPUT_FOLDER)
139
-
140
  # Allow for files to be saved in a temporary folder for increased security in some instances
141
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
142
  # Create a temporary directory
@@ -146,13 +127,9 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
146
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
147
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
148
 
149
-
150
  GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
151
  MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
152
 
153
- ensure_folder_exists(GRADIO_TEMP_DIR)
154
- ensure_folder_exists(MPLCONFIGDIR)
155
-
156
  ###
157
  # LOGGING OPTIONS
158
  ###
@@ -164,32 +141,33 @@ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
164
 
165
  USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
166
 
167
  if USE_LOG_SUBFOLDERS == "True":
168
  day_log_subfolder = today_rev + '/'
169
  host_name_subfolder = HOST_NAME + '/'
170
  full_log_subfolder = day_log_subfolder + host_name_subfolder
171
- else:
172
- full_log_subfolder = ""
173
 
174
- FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
175
- ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
176
- USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
177
 
178
- ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
179
- ensure_folder_exists(ACCESS_LOGS_FOLDER)
180
- ensure_folder_exists(USAGE_LOGS_FOLDER)
181
 
182
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
183
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
184
 
185
  # Further customisation options for CSV logs
186
-
187
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
188
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
189
  CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
190
 
191
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
192
-
193
  SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
194
 
195
  ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
@@ -213,7 +191,6 @@ USAGE_LOG_FILE_NAME = get_or_create_env_var('USAGE_LOG_FILE_NAME', LOG_FILE_NAME
213
  FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
214
 
215
 
216
-
217
  ###
218
  # REDACTION OPTIONS
219
  ###
 
28
 
29
  return value
30
 
31
  def add_folder_to_path(folder_path: str):
32
  '''
33
  Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist. Function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
 
49
  else:
50
  print(f"Folder not found at {folder_path} - not added to PATH")
51
 
52
  ###
53
  # LOAD CONFIG FROM ENV FILE
54
  ###
55
 
56
  CONFIG_FOLDER = get_or_create_env_var('CONFIG_FOLDER', 'config/')
57
 
58
  # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
59
  APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', CONFIG_FOLDER + 'app_config.env') # e.g. config/app_config.env
60
 
 
102
  # Retrieving or setting CUSTOM_HEADER_VALUE
103
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
104
 
105
  ###
106
  # Image options
107
  ###
 
118
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
119
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
120
 
121
  # Allow for files to be saved in a temporary folder for increased security in some instances
122
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
123
  # Create a temporary directory
 
127
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
128
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
129
 
130
  GRADIO_TEMP_DIR = get_or_create_env_var('GRADIO_TEMP_DIR', 'tmp/gradio_tmp/') # Default Gradio temp folder
131
  MPLCONFIGDIR = get_or_create_env_var('MPLCONFIGDIR', 'tmp/matplotlib_cache/') # Matplotlib cache folder
132
 
133
  ###
134
  # LOGGING OPTIONS
135
  ###
 
141
 
142
  USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
143
 
144
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/')
145
+ ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/')
146
+ USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/')
147
+
148
  if USE_LOG_SUBFOLDERS == "True":
149
  day_log_subfolder = today_rev + '/'
150
  host_name_subfolder = HOST_NAME + '/'
151
  full_log_subfolder = day_log_subfolder + host_name_subfolder
152
 
153
+ FEEDBACK_LOGS_FOLDER = FEEDBACK_LOGS_FOLDER + full_log_subfolder
154
+ ACCESS_LOGS_FOLDER = ACCESS_LOGS_FOLDER + full_log_subfolder
155
+ USAGE_LOGS_FOLDER = USAGE_LOGS_FOLDER + full_log_subfolder
156
 
157
+
158
+ S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', FEEDBACK_LOGS_FOLDER)
159
+ S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', ACCESS_LOGS_FOLDER)
160
+ S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', USAGE_LOGS_FOLDER)
161
 
162
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
163
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
164
 
165
  # Further customisation options for CSV logs
166
  CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
167
  CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
168
  CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
169
 
170
  ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
171
  SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
172
 
173
  ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
 
191
  FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
192
 
193
 
194
  ###
195
  # REDACTION OPTIONS
196
  ###
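
A note on the reworked logging configuration above: the base log folders are now read from environment variables first, the dated host subfolder is appended afterwards when USE_LOG_SUBFOLDERS is 'True', and the new S3_* variables default to whatever the local paths resolved to, so S3 uploads mirror the local layout unless explicitly overridden. A self-contained sketch of the resulting paths, using illustrative values for today_rev and HOST_NAME and a simplified stand-in for get_or_create_env_var; the sketch keeps the subfolder append inside the if branch, since the else branch that previously set full_log_subfolder = "" was dropped in this commit.

import os

def get_or_create_env_var(var_name, default_value):
    # Simplified stand-in for the helper in tools/config.py: read the env var, fall back to the default.
    return os.environ.get(var_name, default_value)

today_rev, HOST_NAME = '20250101', 'example-host'  # illustrative values

USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')

FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/')
ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/')
USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/')

if USE_LOG_SUBFOLDERS == 'True':
    full_log_subfolder = today_rev + '/' + HOST_NAME + '/'
    FEEDBACK_LOGS_FOLDER += full_log_subfolder  # e.g. feedback/20250101/example-host/
    ACCESS_LOGS_FOLDER += full_log_subfolder    # e.g. logs/20250101/example-host/
    USAGE_LOGS_FOLDER += full_log_subfolder     # e.g. usage/20250101/example-host/

# S3 locations default to the (already suffixed) local paths unless overridden.
S3_FEEDBACK_LOGS_FOLDER = get_or_create_env_var('S3_FEEDBACK_LOGS_FOLDER', FEEDBACK_LOGS_FOLDER)
S3_ACCESS_LOGS_FOLDER = get_or_create_env_var('S3_ACCESS_LOGS_FOLDER', ACCESS_LOGS_FOLDER)
S3_USAGE_LOGS_FOLDER = get_or_create_env_var('S3_USAGE_LOGS_FOLDER', USAGE_LOGS_FOLDER)
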
tools/file_conversion.py CHANGED
@@ -455,6 +455,7 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
455
  def prepare_image_or_pdf(
456
  file_paths: List[str],
457
  in_redact_method: str,
458
  latest_file_completed: int = 0,
459
  out_message: List[str] = [],
460
  first_loop_state: bool = False,
@@ -506,7 +507,6 @@ def prepare_image_or_pdf(
506
  pymupdf_doc = []
507
  all_img_details = []
508
  review_file_csv = pd.DataFrame()
509
- all_line_level_ocr_results_df = pd.DataFrame()
510
  out_textract_path = ""
511
  combined_out_message = ""
512
  final_out_message = ""
@@ -1289,6 +1289,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1289
  '''
1290
  if not all_annotations:
1291
  # Return an empty DataFrame with the expected schema if input is empty
1292
  return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])
1293
 
1294
  # 1. Create initial DataFrame from the list of annotations
@@ -1302,7 +1303,6 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1302
  else []
1303
  for anno in all_annotations
1304
  ]
1305
-
1306
  })
1307
 
1308
  # 2. Calculate the page number using the helper function
@@ -1718,6 +1718,75 @@ def fill_missing_box_ids(data_input: dict) -> dict:
1718
  # The input dictionary 'data_input' has been modified in place
1719
  return data_input
1720
 
1721
  def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
1722
  """
1723
  Optimized: Generates unique alphanumeric IDs for rows in a DataFrame column
 
455
  def prepare_image_or_pdf(
456
  file_paths: List[str],
457
  in_redact_method: str,
458
+ all_line_level_ocr_results_df:pd.DataFrame,
459
  latest_file_completed: int = 0,
460
  out_message: List[str] = [],
461
  first_loop_state: bool = False,
 
507
  pymupdf_doc = []
508
  all_img_details = []
509
  review_file_csv = pd.DataFrame()
510
  out_textract_path = ""
511
  combined_out_message = ""
512
  final_out_message = ""
 
1289
  '''
1290
  if not all_annotations:
1291
  # Return an empty DataFrame with the expected schema if input is empty
1292
+ print("No annotations found, returning empty dataframe")
1293
  return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])
1294
 
1295
  # 1. Create initial DataFrame from the list of annotations
 
1303
  else []
1304
  for anno in all_annotations
1305
  ]
1306
  })
1307
 
1308
  # 2. Calculate the page number using the helper function
 
1718
  # The input dictionary 'data_input' has been modified in place
1719
  return data_input
1720
 
1721
+ def fill_missing_box_ids_each_box(data_input: Dict) -> Dict:
1722
+ """
1723
+ Generates unique alphanumeric IDs for bounding boxes in a list
1724
+ where the 'id' is missing, blank, or not a 12-character string.
1725
+
1726
+ Args:
1727
+ data_input (Dict): The input dictionary containing 'image' and 'boxes' keys.
1728
+ 'boxes' should be a list of dictionaries, each potentially
1729
+ with an 'id' key.
1730
+
1731
+ Returns:
1732
+ Dict: The input dictionary with missing/invalid box IDs filled.
1733
+ Note: The function modifies the input dictionary in place.
1734
+ """
1735
+ # --- Input Validation ---
1736
+ if not isinstance(data_input, dict):
1737
+ raise TypeError("Input 'data_input' must be a dictionary.")
1738
+ if 'boxes' not in data_input or not isinstance(data_input.get('boxes'), list):
1739
+ # If there are no boxes, there's nothing to do.
1740
+ return data_input
1741
+
1742
+ boxes_list = data_input['boxes']
1743
+ id_length = 12
1744
+ character_set = string.ascii_letters + string.digits
1745
+
1746
+ # --- 1. Get ALL Existing IDs to Ensure Uniqueness ---
1747
+ # Collect all valid existing IDs from the entire list first.
1748
+ existing_ids = set()
1749
+ for box in boxes_list:
1750
+ if isinstance(box, dict):
1751
+ box_id = box.get('id')
1752
+ if isinstance(box_id, str) and len(box_id) == id_length:
1753
+ existing_ids.add(box_id)
1754
+
1755
+ # --- 2. Iterate and Fill IDs for each box ---
1756
+ generated_ids_this_run = set() # Keep track of IDs generated in this run
1757
+ num_filled = 0
1758
+
1759
+ for box in boxes_list:
1760
+ if not isinstance(box, dict):
1761
+ continue # Skip items in the list that are not dictionaries
1762
+
1763
+ box_id = box.get('id')
1764
+
1765
+ # Check if this specific box needs a new ID
1766
+ needs_new_id = (
1767
+ box_id is None or
1768
+ not isinstance(box_id, str) or
1769
+ box_id.strip() == "" or
1770
+ len(box_id) != id_length
1771
+ )
1772
+
1773
+ if needs_new_id:
1774
+ # Generate a truly unique ID
1775
+ while True:
1776
+ candidate_id = ''.join(random.choices(character_set, k=id_length))
1777
+ # Check against original IDs and newly generated IDs
1778
+ if candidate_id not in existing_ids and candidate_id not in generated_ids_this_run:
1779
+ generated_ids_this_run.add(candidate_id)
1780
+ box['id'] = candidate_id # Assign the ID to the individual box
1781
+ num_filled += 1
1782
+ break # Move to the next box
1783
+
1784
+ if num_filled > 0:
1785
+ print(f"Successfully filled {num_filled} missing or invalid box IDs.")
1786
+
1787
+ # The input dictionary 'data_input' has been modified in place
1788
+ return data_input
1789
+
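Editor's note: a minimal usage sketch for fill_missing_box_ids_each_box, assuming the annotation-group shape used elsewhere in this commit; the image path and box values are illustrative only.

from tools.file_conversion import fill_missing_box_ids_each_box

group = {
    "image": "example_page_0.png",  # illustrative path
    "boxes": [
        {"label": "Duplicate text", "xmin": 0.1, "ymin": 0.2, "xmax": 0.8, "ymax": 0.25, "text": "line one", "id": ""},
        {"label": "Duplicate text", "xmin": 0.1, "ymin": 0.3, "xmax": 0.8, "ymax": 0.35, "text": "line two"},  # 'id' key missing entirely
    ],
}
group = fill_missing_box_ids_each_box(group)
assert all(len(box["id"]) == 12 for box in group["boxes"])  # every box now carries a unique 12-character ID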
1790
  def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
1791
  """
1792
  Optimized: Generates unique alphanumeric IDs for rows in a DataFrame column
tools/file_redaction.py CHANGED
@@ -287,7 +287,7 @@ def choose_and_run_redactor(file_paths:List[str],
287
  # Call prepare_image_or_pdf only if needed
288
  if prepare_images_flag is not None:
289
  out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox = prepare_image_or_pdf(
290
- file_paths_loop, text_extraction_method, 0, out_message, True,
291
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
292
  output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
293
  )
@@ -887,7 +887,6 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
887
 
888
  return img_annotation_box, rect
889
 
890
-
891
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
892
  '''
893
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
@@ -932,23 +931,6 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
932
 
933
  return img_annotation_box, rect
934
 
935
- # def set_cropbox_safely(page, original_cropbox):
936
- # """
937
- # Sets the cropbox of a page, ensuring it's not larger than the mediabox.
938
- # If the original cropbox is larger, the mediabox is used instead.
939
-
940
- # Args:
941
- # page: The PyMuPdf page object.
942
- # original_cropbox: The fitz.Rect representing the desired cropbox.
943
- # """
944
- # mediabox = page.mediabox
945
- # if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
946
- # #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
947
- # page.set_cropbox(mediabox)
948
- # else:
949
- # page.set_cropbox(original_cropbox)
950
-
951
-
952
  def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
953
  """
954
  Sets the cropbox of a PyMuPDF page safely and defensively.
@@ -995,7 +977,6 @@ def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
995
  else:
996
  page.set_cropbox(original_cropbox)
997
 
998
-
999
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
1000
 
1001
  rect_height = page.rect.height
@@ -1788,7 +1769,6 @@ def redact_image_pdf(file_path:str,
1788
 
1789
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1790
 
1791
-
1792
  ###
1793
  # PIKEPDF TEXT DETECTION/REDACTION
1794
  ###
 
287
  # Call prepare_image_or_pdf only if needed
288
  if prepare_images_flag is not None:
289
  out_message, prepared_pdf_file_paths, pdf_image_file_paths, annotate_max_pages, annotate_max_pages_bottom, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes, textract_output_found, all_img_details_state, placeholder_ocr_results_df, local_ocr_output_found_checkbox = prepare_image_or_pdf(
290
+ file_paths_loop, text_extraction_method, all_line_level_ocr_results_df, 0, out_message, True,
291
  annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list,
292
  output_folder, prepare_images=prepare_images_flag, page_sizes=page_sizes, input_folder=input_folder
293
  )
 
887
 
888
  return img_annotation_box, rect
889
 
 
890
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
891
  '''
892
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
 
931
 
932
  return img_annotation_box, rect
933
 
934
  def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
935
  """
936
  Sets the cropbox of a PyMuPDF page safely and defensively.
 
977
  else:
978
  page.set_cropbox(original_cropbox)
979
 
 
980
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
981
 
982
  rect_height = page.rect.height
 
1769
 
1770
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1771
 
 
1772
  ###
1773
  # PIKEPDF TEXT DETECTION/REDACTION
1774
  ###
tools/find_duplicate_pages.py CHANGED
@@ -4,13 +4,15 @@ import re
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
- from typing import List, Tuple
 
8
  import gradio as gr
9
  from gradio import Progress
10
  from pathlib import Path
11
  from pymupdf import Document
12
- from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe
13
  import en_core_web_lg
 
14
  nlp = en_core_web_lg.load()
15
 
16
  similarity_threshold = 0.95
@@ -56,10 +58,11 @@ def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, outp
56
  grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
57
  else:
58
  df['line_number_by_page'] = df.groupby('page').cumcount() + 1
 
59
  df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
60
  df['page'] = df['page'].astype(int)
61
 
62
- grouped = df.drop('line_number_by_page', axis=1)
63
 
64
  # Add filename column
65
  grouped['file'] = os.path.basename(file_path)
@@ -405,7 +408,7 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
405
  # Return the updated dataframe, the new file list, and clear the preview panes
406
  return updated_df, new_output_paths, None, None
407
 
408
- def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, duplicates_by_line_or_page_bool:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
409
  """
410
  Wrapper that combines the uploaded OCR output files and runs the duplicate analysis at page or line level, including the 'greedy_match' option.
411
  """
@@ -414,7 +417,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
414
  return None, None, None
415
 
416
  progress(0, desc="Combining input files...")
417
- df_combined, _ = combine_ocr_output_text(files, combine_pages=duplicates_by_line_or_page_bool)
418
 
419
  if df_combined.empty:
420
  gr.Warning("No data found in the uploaded files.")
@@ -427,7 +430,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
427
  min_word_count=min_words,
428
  min_consecutive_pages=int(min_consecutive),
429
  greedy_match=greedy_match,
430
- combine_pages=duplicates_by_line_or_page_bool,
431
  progress=progress
432
  )
433
 
@@ -476,132 +479,438 @@ def show_page_previews(full_data_by_file: dict, results_df: pd.DataFrame, evt: g
476
 
477
  return page1_data[['page', 'text']], page2_data[['page', 'text']]
478
 
479
- def apply_whole_page_redactions_from_list(duplicate_page_numbers_df:pd.DataFrame, doc_file_name_with_extension_textbox:str, review_file_state:pd.DataFrame, duplicate_output_paths:list[str], pymupdf_doc:object, page_sizes:list[dict], all_existing_annotations:list[dict]):
480
  '''
481
- Take a list of suggested whole pages to redact and apply it to review file data currently available from an existing PDF under review
482
  '''
483
- # Create a copy of annotations to avoid modifying the original
484
  all_annotations = all_existing_annotations.copy()
485
 
486
- if not pymupdf_doc:
487
- print("Warning: No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
488
- raise Warning("No document file currently under review. Please upload a document on the 'Review redactions' tab to apply whole page redactions.")
489
- return review_file_state, all_annotations
490
 
491
- # Initialize list of pages to redact
492
- list_whole_pages_to_redact = []
493
-
494
- # Get list of pages to redact from either dataframe or file
495
- if not duplicate_page_numbers_df.empty:
496
- list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
497
- elif duplicate_output_paths:
498
- expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
499
- whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
500
-
501
- for output_file in duplicate_output_paths:
502
- # Note: output_file.name might not be available if output_file is just a string path
503
- # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
504
- file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
505
- if expected_duplicate_pages_to_redact_name in file_name_from_path:
506
- whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
507
- break
508
 
509
  if not whole_pages_list.empty:
510
  list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
511
-
512
- # Convert to set to remove duplicates, then back to list
513
- list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
514
-
515
- if not list_whole_pages_to_redact:
516
- # Assuming gr is defined (e.g., gradio)
517
- print("No relevant list of whole pages to redact found, returning inputs.")
518
- raise Warning("Warning: No relevant list of whole pages to redact found, returning inputs.")
519
- return review_file_state, all_existing_annotations
 
520
 
521
  new_annotations = []
522
-
523
  # Process each page for redaction
524
  for page in list_whole_pages_to_redact:
525
  try:
526
- page_index = int(page) - 1
527
- if page_index < 0 or page_index >= len(pymupdf_doc):
528
- print(f"Page {page} is out of bounds for a document with {len(pymupdf_doc)} pages, skipping.")
 
529
  continue
530
-
531
- pymupdf_page = pymupdf_doc[page_index]
532
 
533
- # Find the matching page size dictionary
534
- page_size = next((size for size in page_sizes if size["page"] == int(page)), None)
535
-
536
- if not page_size:
537
- print(f"Page {page} not found in page_sizes object, skipping.")
538
  continue
539
 
540
- rect_height = page_size["cropbox_height"]
541
- rect_width = page_size["cropbox_width"]
542
- image = page_size["image_path"] # This `image` likely represents the page identifier
543
-
544
- # Create the whole page redaction box
545
- annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, pymupdf_page, border=0.005, redact_pdf=False)
546
 
547
- # Find existing annotation for this image/page
548
- current_page_existing_boxes_group = next((annot_group for annot_group in all_annotations if annot_group["image"] == image), None)
549
 
550
- new_annotation_group = {
551
- "image": image,
552
- "boxes": [annotation_box]
553
- }
554
 
555
- if current_page_existing_boxes_group:
556
- # Check if we already have a whole page redaction for this page
557
- if not any(box["label"] == "Whole page" for box in current_page_existing_boxes_group["boxes"]):
558
- current_page_existing_boxes_group["boxes"].append(annotation_box)
559
 
560
- else:
561
- # Optional: Print a message if a whole-page redaction already exists for this page
562
- print(f"Whole page redaction for page {page} already exists in annotations, skipping addition.")
563
- pass
564
- else: # Create new annotation entry
565
-
566
- all_annotations.append(new_annotation_group)
567
-
568
- new_annotations.append(new_annotation_group)
569
-
570
  except Exception as e:
571
  print(f"Error processing page {page}: {str(e)}")
572
  continue
573
 
574
- # Convert annotations to dataframe and combine with existing review file
575
  whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
576
-
577
- # Ensure all required columns are present in both DataFrames before concat
578
- # This is a common point of error if DFs have different schemas
 
  expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
580
-
581
  for col in expected_cols:
582
- if col not in review_file_state.columns:
583
- review_file_state[col] = None # Or an appropriate default value
584
- if col not in whole_page_review_file.columns:
585
- whole_page_review_file[col] = None
586
 
587
  review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
588
- review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"])
 
 
590
- # --- Remove duplicate entries from the final DataFrame ---
591
- dedup_subset_cols = ['page', 'label', 'text', 'id']
 
 
593
- # Ensure these columns exist before trying to use them as subset for drop_duplicates
594
- if all(col in review_file_out.columns for col in dedup_subset_cols):
595
- review_file_out = review_file_out.drop_duplicates(
596
- subset=dedup_subset_cols,
597
- keep='first' # Keep the first occurrence of a duplicate redaction
598
- )
599
- else:
600
- print(f"Warning: Not all columns required for de-duplication ({dedup_subset_cols}) are present in review_file_out. Skipping specific de-duplication.")
601
- # You might want a fallback or to inspect what's missing
 
602
 
603
- review_file_out.to_csv(OUTPUT_FOLDER + "review_file_out_after_whole_page.csv")
604
 
605
- gr.Info("Successfully created whole page redactions. Go to the 'Review redactions' tab to see them.")
606
 
607
- return review_file_out, all_annotations
 
4
  from tools.helper_functions import OUTPUT_FOLDER
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
7
+ from typing import List, Tuple, Optional, Dict
8
+ from collections import defaultdict
9
  import gradio as gr
10
  from gradio import Progress
11
  from pathlib import Path
12
  from pymupdf import Document
13
+ from tools.file_conversion import redact_whole_pymupdf_page, convert_annotation_data_to_dataframe, fill_missing_box_ids_each_box
14
  import en_core_web_lg
15
+
16
  nlp = en_core_web_lg.load()
17
 
18
  similarity_threshold = 0.95
 
58
  grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
59
  else:
60
  df['line_number_by_page'] = df.groupby('page').cumcount() + 1
61
+ df['original_page'] = df['page']
62
  df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
63
  df['page'] = df['page'].astype(int)
64
 
65
+ grouped = df  # keep 'line_number_by_page' so line-level duplicates can be mapped back to pages
66
 
67
  # Add filename column
68
  grouped['file'] = os.path.basename(file_path)
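Editor's note: a small sketch of the composite page/line ID built above when combine_pages is False. The page and the per-page line number are each zero-padded to five digits and concatenated, so the pair can later be recovered by _parse_page_line_id; the values below are illustrative.

page, line_number_by_page = 2, 5
composite_id = int(str(page).zfill(5) + str(line_number_by_page).zfill(5))
print(composite_id)  # 200005 (leading zeros disappear once the string is cast to int)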
 
408
  # Return the updated dataframe, the new file list, and clear the preview panes
409
  return updated_df, new_output_paths, None, None
410
 
411
+ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, combine_pages:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
412
  """
413
  Wrapper that combines the uploaded OCR output files and runs the duplicate analysis at page or line level, including the 'greedy_match' option.
414
  """
 
417
  return None, None, None
418
 
419
  progress(0, desc="Combining input files...")
420
+ df_combined, _ = combine_ocr_output_text(files, combine_pages=combine_pages)
421
 
422
  if df_combined.empty:
423
  gr.Warning("No data found in the uploaded files.")
 
430
  min_word_count=min_words,
431
  min_consecutive_pages=int(min_consecutive),
432
  greedy_match=greedy_match,
433
+ combine_pages=combine_pages,
434
  progress=progress
435
  )
436
 
 
479
 
480
  return page1_data[['page', 'text']], page2_data[['page', 'text']]
481
 
482
+ def get_page_image_info(page_num: int, page_sizes: List[Dict]) -> Optional[Dict]:
483
+ """
484
+ Finds and returns the size and path information for a specific page.
485
+ """
486
+ return next((size for size in page_sizes if size["page"] == page_num), None)
487
+
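Editor's note: an illustrative lookup with the helper above; the page_sizes entry mirrors the fields this file reads later (image_path, cropbox_width, cropbox_height).

from tools.find_duplicate_pages import get_page_image_info

page_sizes = [{"page": 1, "image_path": "example_page_0.png", "cropbox_width": 612, "cropbox_height": 792}]
print(get_page_image_info(1, page_sizes))   # the dict above
print(get_page_image_info(99, page_sizes))  # None, since that page is absent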
488
+ def add_new_annotations_to_existing_page_annotations(
489
+ all_annotations: List[Dict],
490
+ image_path: str,
491
+ new_annotation_boxes: List[Dict]
492
+ ) -> Tuple[List[Dict], Dict]:
493
+ """
494
+ Adds a list of new annotation boxes to the annotations for a specific page.
495
+
496
+ If the page already has annotations, it extends the list of boxes. If not,
497
+ it creates a new entry for the page.
498
+
499
+ Args:
500
+ all_annotations (List[Dict]): The current list of all annotation groups.
501
+ image_path (str): The identifier for the image/page.
502
+ new_annotation_boxes (List[Dict]): A list of new annotation boxes to add.
503
+
504
+ Returns:
505
+ Tuple[List[Dict], Dict]: A tuple containing:
506
+ - The updated list of all annotation groups.
507
+ - The annotation group representing the newly added boxes.
508
+ """
509
+ # Find the annotation group for the current page/image
510
+ current_page_group = next(
511
+ (annot_group for annot_group in all_annotations if annot_group["image"] == image_path),
512
+ None
513
+ )
514
+
515
+ if current_page_group:
516
+ # Page already has annotations, so extend the list with the new boxes
517
+ current_page_group["boxes"].extend(new_annotation_boxes)
518
+ else:
519
+ # This is the first set of annotations for this page, create a new group
520
+ new_group = {
521
+ "image": image_path,
522
+ "boxes": new_annotation_boxes
523
+ }
524
+ all_annotations.append(new_group)
525
+
526
+ # This object represents all annotations that were just added for this page
527
+ newly_added_annotation_group = {
528
+ "image": image_path,
529
+ "boxes": new_annotation_boxes
530
+ }
531
+
532
+ return all_annotations, newly_added_annotation_group
533
+
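Editor's note: a usage sketch for the helper above; the path and box values are illustrative.

from tools.find_duplicate_pages import add_new_annotations_to_existing_page_annotations

all_annotations = [{"image": "example_page_0.png", "boxes": []}]
new_boxes = [{"label": "Duplicate text", "xmin": 0.1, "ymin": 0.1, "xmax": 0.9, "ymax": 0.15, "text": "x", "id": ""}]
all_annotations, added_group = add_new_annotations_to_existing_page_annotations(
    all_annotations=all_annotations,
    image_path="example_page_0.png",
    new_annotation_boxes=new_boxes,
)
# The existing group for "example_page_0.png" is extended in place;
# added_group holds only the boxes passed in this call.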
534
+ def apply_whole_page_redactions_from_list(duplicate_page_numbers_df: pd.DataFrame, doc_file_name_with_extension_textbox: str, review_file_state: pd.DataFrame, duplicate_output_paths: list[str], pymupdf_doc: object, page_sizes: list[dict], all_existing_annotations: list[dict], combine_pages:bool=True, new_annotations_with_bounding_boxes:List[dict]=[]):
535
  '''
536
+ Take a list of suggested whole pages to redact and apply it to review file data.
537
  '''
 
538
  all_annotations = all_existing_annotations.copy()
539
 
540
+ if not pymupdf_doc:
541
+ message = "No document file currently under review."
542
+ print(f"Warning: {message}")
543
+ raise Warning(message)
544
 
545
+ list_whole_pages_to_redact = []
546
+
547
+ if combine_pages:
548
+ # Get list of pages to redact from either dataframe or file
549
+ if not duplicate_page_numbers_df.empty:
550
+ list_whole_pages_to_redact = duplicate_page_numbers_df.iloc[:, 0].tolist()
551
+ elif duplicate_output_paths:
552
+ expected_duplicate_pages_to_redact_name = f"{doc_file_name_with_extension_textbox}"
553
+ whole_pages_list = pd.DataFrame() # Initialize empty DataFrame
554
+
555
+ for output_file in duplicate_output_paths:
556
+ # Note: output_file.name might not be available if output_file is just a string path
557
+ # If it's a Path object or similar, .name is fine. Otherwise, parse from string.
558
+ file_name_from_path = output_file.split('/')[-1] if isinstance(output_file, str) else output_file.name
559
+ if expected_duplicate_pages_to_redact_name in file_name_from_path:
560
+ whole_pages_list = pd.read_csv(output_file, header=None) # Use output_file directly if it's a path
561
+ break
562
+ else:
563
+ message = "No relevant list of whole pages to redact found."
564
+ print(message)
565
+ raise Warning(message)
566
 
567
  if not whole_pages_list.empty:
568
  list_whole_pages_to_redact = whole_pages_list.iloc[:, 0].tolist()
569
+
570
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
571
+
572
+ else:
573
+ if not new_annotations_with_bounding_boxes:
574
+ message = "Can't find any new annotations to add"
575
+ print(message)
576
+ raise Warning(message)
577
+
578
+ list_whole_pages_to_redact = []
579
+ for annotation in new_annotations_with_bounding_boxes:
580
+ match = re.search(r'_(\d+)\.png$', annotation["image"])
581
+ if match:
582
+ page = int(match.group(1)) + 1
583
+ list_whole_pages_to_redact.append(page)
584
+ else:
585
+ print(f"Warning: Could not extract page number from {annotation['image']}")
586
+
587
+ list_whole_pages_to_redact = list(set(list_whole_pages_to_redact))
588
+
589
 
590
  new_annotations = []
 
591
  # Process each page for redaction
592
  for page in list_whole_pages_to_redact:
593
  try:
594
+ page_num = int(page)
595
+ page_index = page_num - 1
596
+ if not (0 <= page_index < len(pymupdf_doc)):
597
+ print(f"Page {page_num} is out of bounds, skipping.")
598
  continue
 
 
599
 
600
+ page_info = get_page_image_info(page_num, page_sizes)
601
+ if not page_info:
602
+ print(f"Page {page_num} not found in page_sizes, skipping.")
 
 
603
  continue
604
 
605
+ image_path = page_info["image_path"]
606
+ page_annotation_group = next((g for g in all_annotations if g["image"] == image_path), None)
607
+ if page_annotation_group and any(box["label"] == "Whole page" for box in page_annotation_group["boxes"]):
608
+ print(f"Whole page redaction for page {page_num} already exists, skipping.")
609
+ continue
610
+
611
+ # --- Create a LIST of boxes to add ---
612
+ boxes_to_add = []
613
 
614
+ pymupdf_page = pymupdf_doc[page_index]
 
615
 
616
+ if combine_pages:
617
+ whole_page_box = redact_whole_pymupdf_page(
618
+ rect_height=page_info["cropbox_height"],
619
+ rect_width=page_info["cropbox_width"],
620
+ page=pymupdf_page, border=0.005, redact_pdf=False
621
+ )
622
+ boxes_to_add.append(whole_page_box)
623
+ else:
624
+ # Find the specific annotation group that matches the current page's image path
625
+ relevant_box_group = next(
626
+ (group for group in new_annotations_with_bounding_boxes if group.get('image') == image_path),
627
+ None # Default to None if no match is found
628
+ )
629
+
630
+ # Check if we found a matching group of boxes for this page
631
+ if relevant_box_group:
632
+ boxes_to_add.extend(relevant_box_group['boxes'])
633
+ else:
634
+ # This case would be unexpected, but it's good to handle.
635
+ # It means a page was in list_whole_pages_to_redact but had no
636
+ # corresponding boxes generated in new_annotations_with_bounding_boxes.
637
+ print(f"Warning: No new annotation boxes found for page {page_num} ({image_path}).")
638
+
639
+ # === Use the modified helper function to add a LIST of boxes ===
640
+ all_annotations, new_annotations_for_page = add_new_annotations_to_existing_page_annotations(
641
+ all_annotations=all_annotations,
642
+ image_path=image_path,
643
+ new_annotation_boxes=boxes_to_add # Pass the list here
644
+ )
645
 
646
+ new_annotations_for_page = fill_missing_box_ids_each_box(new_annotations_for_page)
647
+ new_annotations.append(new_annotations_for_page)
 
 
648
 
649
  except Exception as e:
650
  print(f"Error processing page {page}: {str(e)}")
651
  continue
652
 
 
653
  whole_page_review_file = convert_annotation_data_to_dataframe(new_annotations)
654
+
655
+ if whole_page_review_file.empty:
656
+ message = "No new whole page redactions were added."
657
+ print(message)
658
+ gr.Info(message)
659
+ return review_file_state, all_annotations
660
+
661
  expected_cols = ['image', 'page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax', 'text', 'id']
 
662
  for col in expected_cols:
663
+ if col not in review_file_state.columns: review_file_state[col] = pd.NA
664
+ if col not in whole_page_review_file.columns: whole_page_review_file[col] = pd.NA
 
 
665
 
666
  review_file_out = pd.concat([review_file_state, whole_page_review_file], ignore_index=True)
667
+ review_file_out = review_file_out.sort_values(by=["page", "ymin", "xmin"]).reset_index(drop=True)
668
+ review_file_out = review_file_out.drop_duplicates(subset=['page', 'label', 'text', 'id'], keep='first')
669
+
670
+ out_message = "Successfully created whole page redactions."
671
+ print(out_message)
672
+ gr.Info(out_message)
673
+
674
+ return review_file_out, all_annotations
675
 
676
+
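Editor's note: a sketch of the filename convention the regex in the function above relies on. Image paths are assumed to end in a zero-based page index, so "_0.png" maps to page 1; the path below is illustrative.

import re

match = re.search(r'_(\d+)\.png$', "output/example_doc_3.png")
if match:
    page = int(match.group(1)) + 1
    print(page)  # 4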
677
+ # --- 1. Helper Function to Parse the Combined Page/Line ID ---
678
+ def _parse_page_line_id(combined_id: int) -> Tuple[int, int]:
679
+ """
680
+ Parses a combined page and line number ID into a (page, line) tuple.
681
+ Assumes the ID, zero-padded to 10 digits, encodes the page in the
682
+ first five digits and the line number in the last five.
683
 
684
+ Example: 100027 -> (1, 27)
685
+ 200005 -> (2, 5)
686
+ """
687
+ # zfill ensures the string is padded with leading zeros to 10 characters
688
+ s_id = str(combined_id).zfill(10)
689
+ page = int(s_id[:5])
690
+ line = int(s_id[5:])
691
+ return page, line
692
+
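Editor's note: a worked example for the parser above; the combined IDs match those produced by combine_ocr_output_text when combine_pages is False.

from tools.find_duplicate_pages import _parse_page_line_id

print(_parse_page_line_id(200005))  # (2, 5): "0000200005" splits into page 2, line 5
print(_parse_page_line_id(100027))  # (1, 27): "0000100027" splits into page 1, line 27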
693
+ # def create_annotations_from_ocr_outputs(ocr_results_df_lines_to_annotate:pd.DataFrame):
694
+ # '''
695
+ # Create a set of annotation boxes based on selected ocr_results_df lines.
696
+ # '''
697
+ # annotations_by_page = []
698
+
699
+ # # --- Build Annotation Boxes for each selected line ---
700
+ # for _, line_row in ocr_results_df_lines_to_annotate.iterrows():
701
+ # # The coordinates are relative, so xmax = left + width and ymax = top + height
702
+ # box = {
703
+ # "label": "Similar Text", # Or any other label you prefer
704
+ # "xmin": line_row['left'],
705
+ # "ymin": line_row['top'] + line_row['height'],
706
+ # "xmax": line_row['left'] + line_row['width'],
707
+ # "ymax": line_row['top'] ,
708
+ # "text": line_row['text']
709
+ # }
710
+ # # --- 6. Group the box by its page number ---
711
+ # page_number = line_row['page']
712
+ # annotations_by_page[page_number].append(box)
713
+
714
+ # return annotations_by_page
715
+
716
+ # def create_annotation_objects_from_duplicates(
717
+ # duplicates_df: pd.DataFrame,
718
+ # ocr_results_df: pd.DataFrame,
719
+ # combine_pages:bool=False
720
+ # ) -> List[Dict]:
721
+ # """
722
+ # Creates structured annotation objects from selected ocr outputs.
723
+
724
+ # Args:
725
+ # duplicates_df (pd.DataFrame): DataFrame containing duplicate ranges with
726
+ # columns like 'Page2_Start_Page' and 'Page2_End_Page'.
727
+ # ocr_results_df (pd.DataFrame): DataFrame with OCR results, including columns
728
+ # 'page', 'text', 'left', 'top', 'width', 'height'.
729
+
730
+ # Returns:
731
+ # List[Dict]: A list of dictionaries, where each dict represents a page and its
732
+ # list of annotation boxes, in the format:
733
+ # [{"page": 1, "boxes": [...]}, {"page": 2, "boxes": [...]}]
734
+ # """
735
+ # annotations_by_page = []
736
+
737
+ # if combine_pages == False:
738
+
739
+ # # --- 2. Prepare OCR Data: Add a line number column if it doesn't exist ---
740
+ # if 'line_number_by_page' not in ocr_results_df.columns:
741
+ # print("Generating 'line_number_by_page' for ocr_results_df...")
742
+ # # Sort by page and original position to ensure correct line numbering
743
+ # ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
744
+ # ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
745
+
746
+ # # Use defaultdict to easily append to lists for each page
747
+ # annotations_by_page = defaultdict(list)
748
+
749
+ # # --- 3. Iterate through each duplicate range ---
750
+ # for _, row in duplicates_df.iterrows():
751
+ # # Parse the start and end page/line numbers from the duplicate row
752
+ # start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
753
+ # end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
754
+
755
+ # # --- 4. Select OCR Lines based on the range ---
756
+ # # This logic correctly handles ranges within a single page and across multiple pages
757
+ # if start_page == end_page:
758
+ # # Simple case: the range is on a single page
759
+ # condition = (
760
+ # (ocr_results_df['page'] == start_page) &
761
+ # (ocr_results_df['line_number_by_page'].between(start_line, end_line))
762
+ # )
763
+ # else:
764
+ # # Complex case: the range spans multiple pages
765
+ # # Condition for the first page in the range
766
+ # cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
767
+ # # Condition for all pages between the start and end
768
+ # cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
769
+ # # Condition for the last page in the range
770
+ # cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
771
+
772
+ # condition = cond_start | cond_middle | cond_end
773
 
774
+ # lines_to_annotate = ocr_results_df[condition]
775
+
776
+ # annotations_by_page = create_annotations_from_ocr_outputs(lines_to_annotate)
777
+
778
+ # # --- Format the final output list ---
779
+ # final_output = []
780
+ # # Sort by page number for a predictable order
781
+ # for page, boxes in sorted(annotations_by_page.items()):
782
+ # final_output.append({
783
+ # "page": page,
784
+ # "boxes": boxes
785
+ # })
786
+
787
+ # return final_output
788
+
789
+ def create_annotation_objects_from_duplicates(
790
+ duplicates_df: pd.DataFrame,
791
+ ocr_results_df: pd.DataFrame,
792
+ page_sizes: List[Dict],
793
+ combine_pages:bool=False
794
+ ) -> List[Dict]:
795
+ """
796
+ Creates structured annotation objects from duplicate line ranges, mapping
797
+ page numbers to image paths.
798
 
799
+ Args:
800
+ duplicates_df (pd.DataFrame): DataFrame with duplicate ranges.
801
+ ocr_results_df (pd.DataFrame): DataFrame with OCR results.
802
+ page_sizes (List[Dict]): A list of dictionaries mapping page numbers to image paths and other metadata. Expected format: [{"page": 1, "image_path": "path/to/img.png", ...}]
803
+ combine_pages (bool): A boolean that determines whether in previous functions, all text from a page was combined (True). This function will only run if this is False.
804
+
805
+ Returns:
806
+ List[Dict]: A list of dictionaries, where each dict represents a page and its list of annotation boxes, in the format: [{"image": "path/to/img.png", "boxes": [...]}, ...]
807
+ """
808
+ final_output = []
809
+
810
+ if not combine_pages:
811
+ # --- NEW: Create an efficient lookup map from page number to image path ---
812
+ page_to_image_map = {item['page']: item['image_path'] for item in page_sizes}
813
+
814
+ # Prepare OCR Data: Add a line number column if it doesn't exist
815
+ if 'line_number_by_page' not in ocr_results_df.columns:
816
+ ocr_results_df = ocr_results_df.sort_values(by=['page', 'top', 'left']).reset_index(drop=True)
817
+ ocr_results_df['line_number_by_page'] = ocr_results_df.groupby('page').cumcount() + 1
818
+
819
+ annotations_by_page = defaultdict(list)
820
+
821
+ # Iterate through each duplicate range (this logic is unchanged)
822
+ for _, row in duplicates_df.iterrows():
823
+ start_page, start_line = _parse_page_line_id(row['Page2_Start_Page'])
824
+ end_page, end_line = _parse_page_line_id(row['Page2_End_Page'])
825
+
826
+ # Select OCR Lines based on the range (this logic is unchanged)
827
+ if start_page == end_page:
828
+ condition = (
829
+ (ocr_results_df['page'] == start_page) &
830
+ (ocr_results_df['line_number_by_page'].between(start_line, end_line))
831
+ )
832
+ else:
833
+ cond_start = (ocr_results_df['page'] == start_page) & (ocr_results_df['line_number_by_page'] >= start_line)
834
+ cond_middle = ocr_results_df['page'].between(start_page + 1, end_page - 1)
835
+ cond_end = (ocr_results_df['page'] == end_page) & (ocr_results_df['line_number_by_page'] <= end_line)
836
+ condition = cond_start | cond_middle | cond_end
837
+
838
+ lines_to_annotate = ocr_results_df[condition]
839
+
840
+ # Build and group annotation boxes by page number (this logic is unchanged)
841
+ for _, line_row in lines_to_annotate.iterrows():
842
+ box = {
843
+ "label": "Duplicate text",
844
+ "color": (0,0,0),
845
+ "xmin": line_row['left'],
846
+ "ymin": line_row['top'],
847
+ "xmax": line_row['left'] + line_row['width'],
848
+ "ymax": line_row['top'] + line_row['height'],
849
+ "text": line_row['text'],
850
+ "id": "" # to be filled in after
851
+ }
852
+ page_number = line_row['page']
853
 
854
+
855
+ annotations_by_page[page_number].append(box)
856
+
857
+ print("annotations_by_page:", annotations_by_page)
858
+
859
+ # --- Format the final output list using the page-to-image map ---
860
+ final_output = []
861
+ # Sort by page number for a predictable order
862
+ for page_num, boxes in sorted(annotations_by_page.items()):
863
+ # Look up the image path using the page number
864
+ image_path = page_to_image_map.get(page_num)
865
+
866
+ if image_path:
867
+ page_boxes = {
868
+ "image": image_path,
869
+ "boxes": boxes
870
+ }
871
+
872
+ # Fill in missing IDs for the new data entries
873
+ page_boxes = fill_missing_box_ids_each_box(page_boxes)
874
+
875
+ # Add the annotation group using 'image' as the key
876
+ final_output.append(page_boxes)
877
+ else:
878
+ # Handle cases where a page might not have a corresponding image path
879
+ print(f"Warning: Page {page_num} found in OCR data but has no corresponding "
880
+ f"entry in the 'page_sizes' object. This page's annotations will be skipped.")
881
+
882
+ print("final_output:", final_output)
883
+
884
+ return final_output
885
+
886
+ # --- Example Usage ---
887
+
888
+ # 1. Create your example DataFrames
889
+ # duplicates_data = {
890
+ # 'Page1_File': ['doc_a.csv'],
891
+ # 'Page1_Start_Page': [100009],
892
+ # 'Page1_End_Page': [100021],
893
+ # 'Page2_File': ['doc_a.csv'],
894
+ # 'Page2_Start_Page': [100027], # Page 1, Line 27
895
+ # 'Page2_End_Page': [200005], # Page 2, Line 5
896
+ # }
897
+ # duplicates_df = pd.DataFrame(duplicates_data)
898
+
899
+ # ocr_data = {
900
+ # 'page': [1]*30 + [2]*10, # 30 lines on page 1, 10 on page 2
901
+ # 'text': [f"Text on page {p}, line {l}" for p in [1, 2] for l in range(1, (31 if p==1 else 11))],
902
+ # # Example coordinates (using small, consistent values for demonstration)
903
+ # 'left': [0.1] * 40,
904
+ # 'top': [i*0.02 for i in range(30)] + [i*0.02 for i in range(10)],
905
+ # 'width': [0.8] * 40,
906
+ # 'height': [0.015] * 40,
907
+ # }
908
+ # ocr_results_df = pd.DataFrame(ocr_data)
909
+
910
+
911
+ # # 2. Run the function
912
+ # generated_annotations = create_annotation_objects_from_duplicates(duplicates_df, ocr_results_df, page_sizes, combine_pages=False)  # page_sizes maps pages to image paths
913
+
914
+ # # 3. Print the result
915
+ # import json
916
+ # print(json.dumps(generated_annotations, indent=2))
tools/helper_functions.py CHANGED
@@ -89,6 +89,16 @@ def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str,
89
  ]
90
  return cost_code_df
91
 
92
  def update_dataframe(df:pd.DataFrame):
93
  df_copy = df.copy()
94
  return df_copy
 
89
  ]
90
  return cost_code_df
91
 
92
+ def ensure_folder_exists(output_folder:str):
93
+ """Checks if the specified folder exists, creates it if not."""
94
+
95
+ if not os.path.exists(output_folder):
96
+ # Create the folder if it doesn't exist
97
+ os.makedirs(output_folder, exist_ok=True)
98
+ print(f"Created the {output_folder} folder.")
99
+ else:
100
+ print(f"The {output_folder} folder already exists.")
101
+
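Editor's note: a one-line usage sketch; "output/" is an illustrative path.

from tools.helper_functions import ensure_folder_exists

ensure_folder_exists("output/")  # creates the folder on first call, reports it as existing afterwards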
102
  def update_dataframe(df:pd.DataFrame):
103
  df_copy = df.copy()
104
  return df_copy
tools/redaction_review.py CHANGED
@@ -1429,7 +1429,6 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List
1429
 
1430
  return output_paths
1431
 
1432
-
1433
  ### Convert xfdf coordinates back to image for app
1434
 
1435
  def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
 
1429
 
1430
  return output_paths
1431
 
 
1432
  ### Convert xfdf coordinates back to image for app
1433
 
1434
  def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):