seanpedrickcase commited on
Commit
e424038
·
1 Parent(s): 9f51e70

Updated packages. Corrected CSV logger headings, can now submit custom log csv names to S3. Started work on identifying and deduplicating at the line level

Browse files
app.py CHANGED
@@ -2,7 +2,7 @@ import os
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
@@ -91,12 +91,12 @@ with app:
91
  backup_image_annotations_state = gr.State([])
92
  backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
93
 
94
- # Logging state
95
- feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
96
- feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
97
  access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
98
  access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
99
- usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
 
 
100
  usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
101
 
102
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -408,6 +408,7 @@ with app:
408
  with gr.Row():
409
  duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
410
  min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
 
411
 
412
  gr.Markdown("#### Matching Strategy")
413
  greedy_match_input = gr.Checkbox(
@@ -681,7 +682,9 @@ with app:
681
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
682
 
683
  # Apply page redactions
684
- annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df], scroll_to_output=True)
 
 
685
 
686
  # Save current page redactions
687
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
@@ -768,7 +771,8 @@ with app:
768
  duplicate_threshold_input,
769
  min_word_count_input,
770
  min_consecutive_pages_input,
771
- greedy_match_input
 
772
  ],
773
  outputs=[
774
  results_df_preview,
@@ -837,8 +841,6 @@ with app:
837
  app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
838
 
839
 
840
- # If relevant environment variable is set, load in the Textract job details
841
-
842
  # If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
843
  if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
844
  if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
@@ -870,40 +872,39 @@ with app:
870
  ### ACCESS LOGS
871
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
872
  access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
 
873
  access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
874
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
875
  success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
876
 
877
  ### FEEDBACK LOGS
 
 
 
878
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
879
  # User submitted feedback for pdf redactions
880
- pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
881
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
882
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
883
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
884
 
885
- # User submitted feedback for data redactions
886
- data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
887
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
888
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
889
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
890
  else:
891
  # User submitted feedback for pdf redactions
892
- pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
893
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
894
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
895
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
896
 
897
  # User submitted feedback for data redactions
898
- data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
899
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
900
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
901
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
902
 
903
  ### USAGE LOGS
904
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
905
-
906
- usage_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
907
 
908
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
909
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
@@ -917,7 +918,7 @@ with app:
917
  successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
918
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
919
  else:
920
- usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
921
 
922
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
923
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME
6
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
7
  from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
8
  from tools.file_redaction import choose_and_run_redactor
 
91
  backup_image_annotations_state = gr.State([])
92
  backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
93
 
94
+ # Logging variables
 
 
95
  access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
96
  access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
97
+ feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
98
+ feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
99
+ usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
100
  usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
101
 
102
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
 
408
  with gr.Row():
409
  duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
410
  min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
411
+ duplicates_by_line_or_page_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
412
 
413
  gr.Markdown("#### Matching Strategy")
414
  greedy_match_input = gr.Checkbox(
 
682
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
683
 
684
  # Apply page redactions
685
+ annotation_button_apply.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
686
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
687
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df], scroll_to_output=True)
688
 
689
  # Save current page redactions
690
  update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
 
771
  duplicate_threshold_input,
772
  min_word_count_input,
773
  min_consecutive_pages_input,
774
+ greedy_match_input,
775
+ duplicates_by_line_or_page_bool
776
  ],
777
  outputs=[
778
  results_df_preview,
 
841
  app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
842
 
843
 
 
 
844
  # If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
845
  if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
846
  if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
 
872
  ### ACCESS LOGS
873
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
874
  access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
875
+
876
  access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
877
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
878
  success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
879
 
880
  ### FEEDBACK LOGS
881
+ pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
882
+ data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
883
+
884
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
885
  # User submitted feedback for pdf redactions
 
886
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
887
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
888
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
889
 
890
+ # User submitted feedback for data redactions
 
891
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
892
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
893
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
894
  else:
895
  # User submitted feedback for pdf redactions
 
896
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
897
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
898
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
899
 
900
  # User submitted feedback for data redactions
 
901
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
902
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
903
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
904
 
905
  ### USAGE LOGS
906
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
907
+ usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
 
908
 
909
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
910
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
 
918
  successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
919
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
920
  else:
921
+ usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
922
 
923
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
924
  success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
cdk/cdk_stack.py CHANGED
@@ -26,7 +26,7 @@ from aws_cdk import (
26
  )
27
 
28
  from constructs import Construct
29
- from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_SSL_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME, NEW_VPC_DEFAULT_NAME, NEW_VPC_CIDR, USE_CUSTOM_KMS_KEY, S3_KMS_KEY_NAME
30
  from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
31
 
32
  def _get_env_list(env_var_name: str) -> List[str]:
@@ -420,7 +420,7 @@ class CdkStack(Stack):
420
 
421
  # --- IAM Roles ---
422
  if USE_CUSTOM_KMS_KEY == '1':
423
- kms_key = kms.Key(self, "RedactionSharedKmsKey", alias=S3_KMS_KEY_NAME, removal_policy=RemovalPolicy.DESTROY)
424
 
425
  custom_sts_kms_policy_dict = {
426
  "Version": "2012-10-17",
@@ -877,7 +877,7 @@ class CdkStack(Stack):
877
  if get_context_bool(f"exists:{secret_name}"):
878
  # Lookup by name
879
  secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
880
- print(f"Using existing Secret {secret_name}.")
881
  else:
882
  if USE_CUSTOM_KMS_KEY == '1' and isinstance(kms_key, kms.Key):
883
  secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
@@ -899,7 +899,7 @@ class CdkStack(Stack):
899
  }
900
  )
901
 
902
- print(f"Created new secret {secret_name}.")
903
 
904
  except Exception as e:
905
  raise Exception("Could not handle Secrets Manager secret due to:", e)
@@ -1235,7 +1235,7 @@ class CdkStack(Stack):
1235
  self,
1236
  "MyHttpsListener", # Logical ID for the HTTPS listener
1237
  alb,
1238
- ACM_SSL_CERTIFICATE_ARN=ACM_SSL_CERTIFICATE_ARN,
1239
  default_target_group=target_group,
1240
  enable_cognito_auth=True,
1241
  cognito_user_pool=user_pool,
 
26
  )
27
 
28
  from constructs import Construct
29
+ from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_SSL_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME, NEW_VPC_DEFAULT_NAME, NEW_VPC_CIDR, USE_CUSTOM_KMS_KEY, CUSTOM_KMS_KEY_NAME
30
  from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
31
 
32
  def _get_env_list(env_var_name: str) -> List[str]:
 
420
 
421
  # --- IAM Roles ---
422
  if USE_CUSTOM_KMS_KEY == '1':
423
+ kms_key = kms.Key(self, "RedactionSharedKmsKey", alias=CUSTOM_KMS_KEY_NAME, removal_policy=RemovalPolicy.DESTROY)
424
 
425
  custom_sts_kms_policy_dict = {
426
  "Version": "2012-10-17",
 
877
  if get_context_bool(f"exists:{secret_name}"):
878
  # Lookup by name
879
  secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
880
+ print(f"Using existing Secret.")
881
  else:
882
  if USE_CUSTOM_KMS_KEY == '1' and isinstance(kms_key, kms.Key):
883
  secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
 
899
  }
900
  )
901
 
902
+ print(f"Created new secret in Secrets Manager for Cognito user pool and related details.")
903
 
904
  except Exception as e:
905
  raise Exception("Could not handle Secrets Manager secret due to:", e)
 
1235
  self,
1236
  "MyHttpsListener", # Logical ID for the HTTPS listener
1237
  alb,
1238
+ acm_certificate_arn=ACM_SSL_CERTIFICATE_ARN,
1239
  default_target_group=target_group,
1240
  enable_cognito_auth=True,
1241
  cognito_user_pool=user_pool,
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.7.1"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
12
  dependencies = [
13
  "pdfminer.six==20240706",
14
  "pdf2image==1.17.0",
15
- "pymupdf==1.25.3",
16
  "opencv-python==4.10.0.84",
17
  "presidio_analyzer==2.2.358",
18
  "presidio_anonymizer==2.2.358",
@@ -24,14 +24,14 @@ dependencies = [
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
  "gradio==5.34.2",
27
- "boto3==1.38.46",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
31
  "python-levenshtein==0.26.1",
32
  "spaczz==0.6.1",
33
  # Direct URL dependency for gradio_image_annotator wheel
34
- "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.2/gradio_image_annotation-0.3.2-py3-none-any.whl",
35
  "rapidfuzz==3.12.1",
36
  "python-dotenv==1.0.1",
37
  "numpy==1.26.4",
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.7.2"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
 
12
  dependencies = [
13
  "pdfminer.six==20240706",
14
  "pdf2image==1.17.0",
15
+ "pymupdf==1.26.1",
16
  "opencv-python==4.10.0.84",
17
  "presidio_analyzer==2.2.358",
18
  "presidio_anonymizer==2.2.358",
 
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
  "gradio==5.34.2",
27
+ "boto3==1.39.1",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
30
  "Faker==36.1.1",
31
  "python-levenshtein==0.26.1",
32
  "spaczz==0.6.1",
33
  # Direct URL dependency for gradio_image_annotator wheel
34
+ "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
35
  "rapidfuzz==3.12.1",
36
  "python-dotenv==1.0.1",
37
  "numpy==1.26.4",
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  pdfminer.six==20240706
2
  pdf2image==1.17.0
3
- pymupdf==1.25.3
4
  opencv-python==4.10.0.84
5
  presidio_analyzer==2.2.358
6
  presidio_anonymizer==2.2.358
@@ -11,14 +11,14 @@ scikit-learn==1.6.1
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
  gradio==5.34.2
14
- boto3==1.38.46
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1
18
  python-levenshtein==0.26.1
19
  spaczz==0.6.1
20
  # The following custom version of the gradio_image_annotation component is used:
21
- https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.2/gradio_image_annotation-0.3.2-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
22
  rapidfuzz==3.12.1
23
  python-dotenv==1.0.1
24
  numpy==1.26.4
 
1
  pdfminer.six==20240706
2
  pdf2image==1.17.0
3
+ pymupdf==1.26.1
4
  opencv-python==4.10.0.84
5
  presidio_analyzer==2.2.358
6
  presidio_anonymizer==2.2.358
 
11
  spacy==3.8.7
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
  gradio==5.34.2
14
+ boto3==1.39.1
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
17
  Faker==36.1.1
18
  python-levenshtein==0.26.1
19
  spaczz==0.6.1
20
  # The following custom version of the gradio_image_annotation component is used:
21
+ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
22
  rapidfuzz==3.12.1
23
  python-dotenv==1.0.1
24
  numpy==1.26.4
src/installation_guide.qmd CHANGED
@@ -42,30 +42,30 @@ Update your DNS records to include the CNAME record given by AWS. After your sta
42
 
43
  ### 1. Create a python environment, load in packages from `requirements.txt`.
44
 
45
- You need a `cdk.json` in the `cdk` folder. It should contain the following:
46
-
47
- ```json
48
- {
49
- "app": "<PATH TO PYTHON ENVIRONMENT FOLDER WHERE REQUIREMENTS HAVE BEEN LOADED>/python.exe app.py",
50
- "context": {
51
- "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
52
- "@aws-cdk/core:stackRelativeExports": true,
53
- "@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
54
- "@aws-cdk/aws-lambda:recognizeVersionProps": true,
55
- "@aws-cdk/aws-lambda:recognizeLayerVersion": true,
56
- "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
57
- "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
58
- "@aws-cdk/core:newStyleStackSynthesis": true,
59
- "aws-cdk:enableDiffNoFail": true,
60
- "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
61
- "@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
62
- "@aws-cdk/core:target-partitions": [
63
- "aws",
64
- "aws-cn"
65
- ]
66
- }
67
- }
68
- ```
69
 
70
  ### 2. Create a `cdk_config.env` file in the `config` subfolder.
71
 
@@ -75,24 +75,22 @@ Depending on which environment variables you put in this file, you can choose wh
75
 
76
  At a minimum, it is useful to put the following details in the cdk_config.env file (below are all example values; other possible variables that can be used here are listed in `cdk_config.py` in the `cdk` folder).
77
 
78
- ```ini
79
- CDK_PREFIX=example-prefix # This prefix will be added to the name of most of the created elements in your stack
80
- NEW_VPC_CIDR=10.0.0.0/24 # The CIDR range for your newly created VPC
81
- AWS_REGION=<your-region> # Region where elements will be created
82
- AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that you will use for deploying the stack
83
- CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
84
- CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
85
-
86
- COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
87
- COGNITO_AUTH=1 # Do you want to do in-app authentication (username and password only, not necessary if you are using an SSL certificate as recommended below)
88
- USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
89
- RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
90
- CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
91
- # If you are using an SSL certificate with your ALB (highly recommended):
92
- ACM_SSL_CERTIFICATE_ARN=<SSL Certificate ARN> # This is the ARN of the SSL certificate that you have installed in AWS Certificate Manager
93
- SSL_CERTIFICATE_DOMAIN=redaction.example.com # This is the domain of the SSL certificate that you have installed in AWS Certificate Manager
94
-
95
- ```
96
 
97
  **Note: If you are using an SSL certificate with Cognito login on the application load balancer (strongly advised), you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**
98
 
 
42
 
43
  ### 1. Create a python environment, load in packages from `requirements.txt`.
44
 
45
+ You need a `cdk.json` in the `cdk` folder. It should contain the following:
46
+
47
+ ```json
48
+ {
49
+ "app": "<PATH TO PYTHON ENVIRONMENT FOLDER WHERE REQUIREMENTS HAVE BEEN LOADED>/python.exe app.py",
50
+ "context": {
51
+ "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
52
+ "@aws-cdk/core:stackRelativeExports": true,
53
+ "@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
54
+ "@aws-cdk/aws-lambda:recognizeVersionProps": true,
55
+ "@aws-cdk/aws-lambda:recognizeLayerVersion": true,
56
+ "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
57
+ "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
58
+ "@aws-cdk/core:newStyleStackSynthesis": true,
59
+ "aws-cdk:enableDiffNoFail": true,
60
+ "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
61
+ "@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
62
+ "@aws-cdk/core:target-partitions": [
63
+ "aws",
64
+ "aws-cn"
65
+ ]
66
+ }
67
+ }
68
+ ```
69
 
70
  ### 2. Create a `cdk_config.env` file in the `config` subfolder.
71
 
 
75
 
76
  At a minimum, it is useful to put the following details in the cdk_config.env file (below are all example values; other possible variables that can be used here are listed in `cdk_config.py` in the `cdk` folder).
77
 
78
+ ```ini
79
+ CDK_PREFIX=example-prefix # This prefix will be added to the name of most of the created elements in your stack
80
+ NEW_VPC_CIDR=10.0.0.0/24 # The CIDR range for your newly created VPC
81
+ AWS_REGION=<your-region> # Region where elements will be created
82
+ AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that you will use for deploying the stack
83
+ CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
84
+ CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
85
+ COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
86
+ COGNITO_AUTH=1 # Do you want to do in-app authentication (username and password only, not necessary if you are using an SSL certificate as recommended below)
87
+ USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
88
+ RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a CloudFront distribution and a web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
89
+ CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
90
+ # If you are using an SSL certificate with your ALB (highly recommended):
91
+ ACM_SSL_CERTIFICATE_ARN=<SSL Certificate ARN> # This is the ARN of the SSL certificate that you have installed in AWS Certificate Manager
92
+ SSL_CERTIFICATE_DOMAIN=redaction.example.com # This is the domain of the SSL certificate that you have installed in AWS Certificate Manager
93
+ ```
 
 
94
 
95
  **Note: If you are using an SSL certificate with Cognito login on the application load balancer (strongly advised), you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**
96
 
tools/config.py CHANGED
@@ -209,6 +209,9 @@ if LOGGING == 'True':
209
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
210
 
211
  LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
 
 
 
212
 
213
 
214
  ###
 
209
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
210
 
211
  LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
212
+ USAGE_LOG_FILE_NAME = get_or_create_env_var('USAGE_LOG_FILE_NAME', LOG_FILE_NAME)
213
+ FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
214
+
215
 
216
 
217
  ###
tools/custom_csvlogger.py CHANGED
@@ -17,7 +17,6 @@ from gradio_client import utils as client_utils
17
  import gradio as gr
18
  from gradio import utils, wasm_utils
19
  from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
20
- from botocore.exceptions import NoCredentialsError, TokenRetrievalError
21
 
22
 
23
  if TYPE_CHECKING:
@@ -78,12 +77,15 @@ class CSVLogger_custom(FlaggingCallback):
78
  os.makedirs(self.flagging_dir, exist_ok=True)
79
 
80
  if replacement_headers:
 
 
 
81
  if len(replacement_headers) != len(self.components):
82
  raise ValueError(
83
  f"replacement_headers must have the same length as components "
84
  f"({len(replacement_headers)} provided, {len(self.components)} expected)"
85
  )
86
- headers = replacement_headers + ["timestamp"]
87
  else:
88
  if additional_headers is None:
89
  additional_headers = []
@@ -141,12 +143,14 @@ class CSVLogger_custom(FlaggingCallback):
141
  replacement_headers: list[str] | None = None
142
  ) -> int:
143
  if self.first_time:
 
144
  additional_headers = []
145
  if flag_option is not None:
146
  additional_headers.append("flag")
147
  if username is not None:
148
  additional_headers.append("username")
149
  additional_headers.append("id")
 
150
  self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
151
  self.first_time = False
152
 
@@ -177,13 +181,12 @@ class CSVLogger_custom(FlaggingCallback):
177
  if username is not None:
178
  csv_data.append(username)
179
 
180
-
181
- timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
182
- csv_data.append(timestamp)
183
-
184
  generated_id = str(uuid.uuid4())
185
  csv_data.append(generated_id)
186
 
 
 
 
187
  # Build the headers
188
  headers = (
189
  [getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
@@ -192,8 +195,8 @@ class CSVLogger_custom(FlaggingCallback):
192
  headers.append("flag")
193
  if username is not None:
194
  headers.append("username")
195
- headers.append("timestamp")
196
  headers.append("id")
 
197
 
198
  line_count = -1
199
 
 
17
  import gradio as gr
18
  from gradio import utils, wasm_utils
19
  from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
 
20
 
21
 
22
  if TYPE_CHECKING:
 
77
  os.makedirs(self.flagging_dir, exist_ok=True)
78
 
79
  if replacement_headers:
80
+ if additional_headers is None:
81
+ additional_headers = []
82
+
83
  if len(replacement_headers) != len(self.components):
84
  raise ValueError(
85
  f"replacement_headers must have the same length as components "
86
  f"({len(replacement_headers)} provided, {len(self.components)} expected)"
87
  )
88
+ headers = replacement_headers + additional_headers + ["timestamp"]
89
  else:
90
  if additional_headers is None:
91
  additional_headers = []
 
143
  replacement_headers: list[str] | None = None
144
  ) -> int:
145
  if self.first_time:
146
+ print("First time creating file")
147
  additional_headers = []
148
  if flag_option is not None:
149
  additional_headers.append("flag")
150
  if username is not None:
151
  additional_headers.append("username")
152
  additional_headers.append("id")
153
+ #additional_headers.append("timestamp")
154
  self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
155
  self.first_time = False
156
 
 
181
  if username is not None:
182
  csv_data.append(username)
183
 
 
 
 
 
184
  generated_id = str(uuid.uuid4())
185
  csv_data.append(generated_id)
186
 
187
+ timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
188
+ csv_data.append(timestamp)
189
+
190
  # Build the headers
191
  headers = (
192
  [getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
 
195
  headers.append("flag")
196
  if username is not None:
197
  headers.append("username")
 
198
  headers.append("id")
199
+ headers.append("timestamp")
200
 
201
  line_count = -1
202
 
tools/find_duplicate_pages.py CHANGED
@@ -15,7 +15,7 @@ nlp = en_core_web_lg.load()
15
 
16
  similarity_threshold = 0.95
17
 
18
- def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
19
  """
20
  Combines text from multiple CSV files containing page and text columns.
21
  Groups text by file and page number, concatenating text within these groups.
@@ -52,7 +52,14 @@ def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLD
52
  df['text'] = df['text'].fillna('').astype(str)
53
 
54
  # Group by page and concatenate text
55
- grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
 
 
 
 
 
 
 
56
 
57
  # Add filename column
58
  grouped['file'] = os.path.basename(file_path)
@@ -143,7 +150,7 @@ def map_metadata_subdocument(subdocument_df:pd.DataFrame, metadata_source_df:pd.
143
 
144
  return final_df
145
 
146
- def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str) -> list:
147
  """
148
  Saves the main results DataFrame and generates per-file redaction lists.
149
  This function is extracted to be reusable.
@@ -151,6 +158,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str)
151
  Args:
152
  final_df (pd.DataFrame): The DataFrame containing the final match results.
153
  output_folder (str): The folder to save the output files.
 
154
 
155
  Returns:
156
  list: A list of paths to all generated files.
@@ -172,32 +180,33 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str)
172
 
173
  # 2. Save per-file redaction lists
174
  # Use 'Page2_File' as the source of duplicate content
175
- grouping_col = 'Page2_File'
176
- if grouping_col not in final_df.columns:
177
- print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
178
- return output_paths
179
-
180
- for redact_file, group in final_df.groupby(grouping_col):
181
- output_file_name_stem = Path(redact_file).stem
182
- output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
183
-
184
- all_pages_to_redact = set()
185
- is_subdocument_match = 'Page2_Start_Page' in group.columns
186
-
187
- if is_subdocument_match:
188
- for _, row in group.iterrows():
189
- pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
190
- all_pages_to_redact.update(pages_in_range)
191
- else:
192
- pages = group['Page2_Page'].unique()
193
- all_pages_to_redact.update(pages)
194
-
195
- if all_pages_to_redact:
196
- redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
197
- redaction_df.to_csv(output_file_path, header=False, index=False)
 
198
 
199
- output_paths.append(str(output_file_path))
200
- print(f"Redaction list for {redact_file} saved to {output_file_path}")
201
 
202
  return output_paths
203
 
@@ -206,7 +215,8 @@ def identify_similar_pages(
206
  similarity_threshold: float = 0.9,
207
  min_word_count: int = 10,
208
  min_consecutive_pages: int = 1,
209
- greedy_match: bool = False, # NEW parameter
 
210
  output_folder: str = OUTPUT_FOLDER,
211
  progress=Progress(track_tqdm=True)
212
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
@@ -341,7 +351,7 @@ def identify_similar_pages(
341
 
342
  progress(0.8, desc="Saving output files")
343
 
344
- output_paths = save_results_and_redaction_lists(final_df, output_folder)
345
 
346
  return final_df, output_paths, df_combined
347
 
@@ -395,7 +405,7 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
395
  # Return the updated dataframe, the new file list, and clear the preview panes
396
  return updated_df, new_output_paths, None, None
397
 
398
- def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
399
  """
400
  Wrapper function updated to include the 'greedy_match' boolean.
401
  """
@@ -404,7 +414,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
404
  return None, None, None
405
 
406
  progress(0, desc="Combining input files...")
407
- df_combined, _ = combine_ocr_output_text(files)
408
 
409
  if df_combined.empty:
410
  gr.Warning("No data found in the uploaded files.")
@@ -417,6 +427,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
417
  min_word_count=min_words,
418
  min_consecutive_pages=int(min_consecutive),
419
  greedy_match=greedy_match,
 
420
  progress=progress
421
  )
422
 
 
15
 
16
  similarity_threshold = 0.95
17
 
18
+ def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, output_folder:str=OUTPUT_FOLDER):
19
  """
20
  Combines text from multiple CSV files containing page and text columns.
21
  Groups text by file and page number, concatenating text within these groups.
 
52
  df['text'] = df['text'].fillna('').astype(str)
53
 
54
  # Group by page and concatenate text
55
+ if combine_pages == True:
56
+ grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
57
+ else:
58
+ df['line_number_by_page'] = df.groupby('page').cumcount() + 1
59
+ df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
60
+ df['page'] = df['page'].astype(int)
61
+
62
+ grouped = df.drop('line_number_by_page', axis=1)
63
 
64
  # Add filename column
65
  grouped['file'] = os.path.basename(file_path)
 
150
 
151
  return final_df
152
 
153
+ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str, combine_pages:bool = True) -> list:
154
  """
155
  Saves the main results DataFrame and generates per-file redaction lists.
156
  This function is extracted to be reusable.
 
158
  Args:
159
  final_df (pd.DataFrame): The DataFrame containing the final match results.
160
  output_folder (str): The folder to save the output files.
161
+ combine_pages (bool, optional): Boolean to check whether the text from pages have been combined into one, or if instead the duplicate match has been conducted line by line.
162
 
163
  Returns:
164
  list: A list of paths to all generated files.
 
180
 
181
  # 2. Save per-file redaction lists
182
  # Use 'Page2_File' as the source of duplicate content
183
+ if combine_pages == True:
184
+ grouping_col = 'Page2_File'
185
+ if grouping_col not in final_df.columns:
186
+ print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
187
+ return output_paths
188
+
189
+ for redact_file, group in final_df.groupby(grouping_col):
190
+ output_file_name_stem = Path(redact_file).stem
191
+ output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
192
+
193
+ all_pages_to_redact = set()
194
+ is_subdocument_match = 'Page2_Start_Page' in group.columns
195
+
196
+ if is_subdocument_match:
197
+ for _, row in group.iterrows():
198
+ pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
199
+ all_pages_to_redact.update(pages_in_range)
200
+ else:
201
+ pages = group['Page2_Page'].unique()
202
+ all_pages_to_redact.update(pages)
203
+
204
+ if all_pages_to_redact:
205
+ redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
206
+ redaction_df.to_csv(output_file_path, header=False, index=False)
207
 
208
+ output_paths.append(str(output_file_path))
209
+ print(f"Redaction list for {redact_file} saved to {output_file_path}")
210
 
211
  return output_paths
212
 
 
215
  similarity_threshold: float = 0.9,
216
  min_word_count: int = 10,
217
  min_consecutive_pages: int = 1,
218
+ greedy_match: bool = False,
219
+ combine_pages:bool=True,
220
  output_folder: str = OUTPUT_FOLDER,
221
  progress=Progress(track_tqdm=True)
222
  ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
 
351
 
352
  progress(0.8, desc="Saving output files")
353
 
354
+ output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
355
 
356
  return final_df, output_paths, df_combined
357
 
 
405
  # Return the updated dataframe, the new file list, and clear the preview panes
406
  return updated_df, new_output_paths, None, None
407
 
408
+ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, duplicates_by_line_or_page_bool:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
409
  """
410
  Wrapper function updated to include the 'greedy_match' boolean.
411
  """
 
414
  return None, None, None
415
 
416
  progress(0, desc="Combining input files...")
417
+ df_combined, _ = combine_ocr_output_text(files, combine_pages=duplicates_by_line_or_page_bool)
418
 
419
  if df_combined.empty:
420
  gr.Warning("No data found in the uploaded files.")
 
427
  min_word_count=min_words,
428
  min_consecutive_pages=int(min_consecutive),
429
  greedy_match=greedy_match,
430
+ combine_pages=duplicates_by_line_or_page_bool,
431
  progress=progress
432
  )
433