Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Jul 2

Commit

e424038

1 Parent(s): 9f51e70

Updated packages. Corrected CSV logger headings; custom log CSV file names can now be submitted to S3. Started work on identifying and deduplicating duplicate text at the line level.

Browse files

Files changed (8) hide show

app.py +18 -17
cdk/cdk_stack.py +5 -5
pyproject.toml +4 -4
requirements.txt +3 -3
src/installation_guide.qmd +40 -42
tools/config.py +3 -0
tools/custom_csvlogger.py +10 -7
tools/find_duplicate_pages.py +43 -32

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC,  TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
 from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
 from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
@@ -91,12 +91,12 @@ with app:
     backup_image_annotations_state = gr.State([])
     backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
-    # Logging state
-    feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
-    feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
     access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
     access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
-    usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
     usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
     session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -408,6 +408,7 @@ with app:
                 with gr.Row():
                     duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
                     min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
                 gr.Markdown("#### Matching Strategy")
                 greedy_match_input = gr.Checkbox(
@@ -681,7 +682,9 @@ with app:
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
     # Apply page redactions
-    annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df], scroll_to_output=True)
     # Save current page redactions
     update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
@@ -768,7 +771,8 @@ with app:
             duplicate_threshold_input,
             min_word_count_input,
             min_consecutive_pages_input,
-            greedy_match_input
         ],
         outputs=[
             results_df_preview,
@@ -837,8 +841,6 @@ with app:
         app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
-    # If relevant environment variable is set, load in the Textract job details
     # If the relevant environment variable is set, load in the default allow list file from S3 or locally. Even when an S3 path is set, a local path is still needed to give a download location
     if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
         if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
@@ -870,40 +872,39 @@ with app:
     ### ACCESS LOGS
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
     access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
     session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
     success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     ### FEEDBACK LOGS
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
         # User submitted feedback for pdf redactions
-        pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
         pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
         pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
-        # User submitted feedback for data redactions
-        data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
         data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
         data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
     else:
         # User submitted feedback for pdf redactions
-        pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
         pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
         pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
         # User submitted feedback for data redactions
-        data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
         data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
         data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
     ### USAGE LOGS
     # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
-    usage_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
         usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
@@ -917,7 +918,7 @@ with app:
         successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     else:
-        usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
         latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

 import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC,  TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME
 from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
 from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
     backup_image_annotations_state = gr.State([])
     backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
+    # Logging variables
     access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
     access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
+    feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
+    feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
+    usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
     usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
     session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
                 with gr.Row():
                     duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
                     min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
+                    duplicates_by_line_or_page_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
                 gr.Markdown("#### Matching Strategy")
                 greedy_match_input = gr.Checkbox(
         success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
     # Apply page redactions
+    annotation_button_apply.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+    success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
+    success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df], scroll_to_output=True)
     # Save current page redactions
     update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
             duplicate_threshold_input,
             min_word_count_input,
             min_consecutive_pages_input,
+            greedy_match_input,
+            duplicates_by_line_or_page_bool
         ],
         outputs=[
             results_df_preview,
         app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
     # If the relevant environment variable is set, load in the default allow list file from S3 or locally. Even when an S3 path is set, a local path is still needed to give a download location
     if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
         if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
     ### ACCESS LOGS
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
     access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
     session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
     success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     ### FEEDBACK LOGS
+    pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
+    data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
         # User submitted feedback for pdf redactions
         pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
         pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
+        # User submitted feedback for data redactions
         data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
         data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
     else:
         # User submitted feedback for pdf redactions
         pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
         pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
         # User submitted feedback for data redactions
         data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
         data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
     ### USAGE LOGS
     # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
+    usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
         usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
         successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     else:
+        usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
         latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

cdk/cdk_stack.py CHANGED Viewed

@@ -26,7 +26,7 @@ from aws_cdk import (
 )
 from constructs import Construct
-from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_SSL_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME, NEW_VPC_DEFAULT_NAME, NEW_VPC_CIDR, USE_CUSTOM_KMS_KEY, S3_KMS_KEY_NAME
 from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
 def _get_env_list(env_var_name: str) -> List[str]:
@@ -420,7 +420,7 @@ class CdkStack(Stack):
         # --- IAM Roles ---
         if USE_CUSTOM_KMS_KEY == '1':
-            kms_key = kms.Key(self, "RedactionSharedKmsKey", alias=S3_KMS_KEY_NAME, removal_policy=RemovalPolicy.DESTROY)
             custom_sts_kms_policy_dict = {
         "Version": "2012-10-17",
@@ -877,7 +877,7 @@ class CdkStack(Stack):
             if get_context_bool(f"exists:{secret_name}"):
                  # Lookup by name
                  secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
-                 print(f"Using existing Secret {secret_name}.")
             else:
                 if USE_CUSTOM_KMS_KEY == '1' and isinstance(kms_key, kms.Key):
                     secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
@@ -899,7 +899,7 @@ class CdkStack(Stack):
                         }
                     )
-                print(f"Created new secret {secret_name}.")
         except Exception as e:
              raise Exception("Could not handle Secrets Manager secret due to:", e)
@@ -1235,7 +1235,7 @@ class CdkStack(Stack):
                 self,
                 "MyHttpsListener", # Logical ID for the HTTPS listener
                 alb,
-                ACM_SSL_CERTIFICATE_ARN=ACM_SSL_CERTIFICATE_ARN,
                 default_target_group=target_group,
                 enable_cognito_auth=True,
                 cognito_user_pool=user_pool,

 )
 from constructs import Construct
+from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_SSL_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME, NEW_VPC_DEFAULT_NAME, NEW_VPC_CIDR, USE_CUSTOM_KMS_KEY, CUSTOM_KMS_KEY_NAME
 from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
 def _get_env_list(env_var_name: str) -> List[str]:
         # --- IAM Roles ---
         if USE_CUSTOM_KMS_KEY == '1':
+            kms_key = kms.Key(self, "RedactionSharedKmsKey", alias=CUSTOM_KMS_KEY_NAME, removal_policy=RemovalPolicy.DESTROY)
             custom_sts_kms_policy_dict = {
         "Version": "2012-10-17",
             if get_context_bool(f"exists:{secret_name}"):
                  # Lookup by name
                  secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
+                 print(f"Using existing Secret.")
             else:
                 if USE_CUSTOM_KMS_KEY == '1' and isinstance(kms_key, kms.Key):
                     secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
                         }
                     )
+                print(f"Created new secret in Secrets Manager for Cognito user pool and related details.")
         except Exception as e:
              raise Exception("Could not handle Secrets Manager secret due to:", e)
                 self,
                 "MyHttpsListener", # Logical ID for the HTTPS listener
                 alb,
+                acm_certificate_arn=ACM_SSL_CERTIFICATE_ARN,
                 default_target_group=target_group,
                 enable_cognito_auth=True,
                 cognito_user_pool=user_pool,

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "doc_redaction"
-version = "0.7.1"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
 dependencies = [
     "pdfminer.six==20240706",
     "pdf2image==1.17.0",
-    "pymupdf==1.25.3",
     "opencv-python==4.10.0.84",
     "presidio_analyzer==2.2.358",
     "presidio_anonymizer==2.2.358",
@@ -24,14 +24,14 @@ dependencies = [
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
     "gradio==5.34.2",
-    "boto3==1.38.46",
     "pyarrow==19.0.1",
     "openpyxl==3.1.5",
     "Faker==36.1.1",
     "python-levenshtein==0.26.1",
     "spaczz==0.6.1",
     # Direct URL dependency for gradio_image_annotator wheel
-    "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.2/gradio_image_annotation-0.3.2-py3-none-any.whl",
     "rapidfuzz==3.12.1",
     "python-dotenv==1.0.1",
     "numpy==1.26.4",

 [project]
 name = "doc_redaction"
+version = "0.7.2"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
     "pdfminer.six==20240706",
     "pdf2image==1.17.0",
+    "pymupdf==1.26.1",
     "opencv-python==4.10.0.84",
     "presidio_analyzer==2.2.358",
     "presidio_anonymizer==2.2.358",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
     "gradio==5.34.2",
+    "boto3==1.39.1",
     "pyarrow==19.0.1",
     "openpyxl==3.1.5",
     "Faker==36.1.1",
     "python-levenshtein==0.26.1",
     "spaczz==0.6.1",
     # Direct URL dependency for gradio_image_annotator wheel
+    "gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
     "rapidfuzz==3.12.1",
     "python-dotenv==1.0.1",
     "numpy==1.26.4",

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 pdfminer.six==20240706
 pdf2image==1.17.0
-pymupdf==1.25.3
 opencv-python==4.10.0.84
 presidio_analyzer==2.2.358
 presidio_anonymizer==2.2.358
@@ -11,14 +11,14 @@ scikit-learn==1.6.1
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 gradio==5.34.2
-boto3==1.38.46
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
 # The following version
-https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.2/gradio_image_annotation-0.3.2-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
 rapidfuzz==3.12.1
 python-dotenv==1.0.1
 numpy==1.26.4

 pdfminer.six==20240706
 pdf2image==1.17.0
+pymupdf==1.26.1
 opencv-python==4.10.0.84
 presidio_analyzer==2.2.358
 presidio_anonymizer==2.2.358
 spacy==3.8.7
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 gradio==5.34.2
+boto3==1.39.1
 pyarrow==19.0.1
 openpyxl==3.1.5
 Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
 # gradio_image_annotation is installed from the following GitHub release wheel
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
 rapidfuzz==3.12.1
 python-dotenv==1.0.1
 numpy==1.26.4

src/installation_guide.qmd CHANGED Viewed

@@ -42,30 +42,30 @@ Update your DNS records to include the CNAME record given by AWS. After your sta
 ### 1.  Create a python environment, load in packages from `requirements.txt`.
-    You need a `cdk.json` in the `cdk` folder. It should contain the following:
-    ```json
-    {
-        "app": "<PATH TO PYTHON ENVIRONMENT FOLDER WHERE REQUIREMENTS HAVE BEEN LOADED>/python.exe app.py",
-        "context": {
-          "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
-          "@aws-cdk/core:stackRelativeExports": true,
-          "@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
-          "@aws-cdk/aws-lambda:recognizeVersionProps": true,
-          "@aws-cdk/aws-lambda:recognizeLayerVersion": true,
-          "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
-          "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
-          "@aws-cdk/core:newStyleStackSynthesis": true,
-          "aws-cdk:enableDiffNoFail": true,
-          "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
-          "@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
-          "@aws-cdk/core:target-partitions": [
-            "aws",
-            "aws-cn"
-          ]
-        }
-      }
-    ```
 ### 2.  Create a `cdk_config.env` file in the `config` subfolder.
@@ -75,24 +75,22 @@ Depending on which environment variables you put in this file, you can choose wh
 Here as a minimum it would be useful to put the following details in the cdk_config.env file (below are all example values, other possible variables to use here can be seen in the `cdk` folder/`cdk_config.py`).
-    ```ini
-    CDK_PREFIX=example-prefix # This prefix will be added to the name of most of the created elements in your stack
-    NEW_VPC_CIDR=10.0.0.0/24 # The CIDR range for your newly created VPC
-    AWS_REGION=<your-region> # Region where elements will be created
-    AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that you will use for deploying the stack
-    CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
-    CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
-    COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
-    COGNITO_AUTH=1 # Do you want to do in-app authentication (username and password only, not necessary if you are using an SSL certificate as recommended below)
-    USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
-    RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
-    CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
-    # If you are using an SSL certificate with your ALB (highly recommended):
-    ACM_SSL_CERTIFICATE_ARN=<SSL Certificate ARN> # This is the ARN of the SSL certificate that you have installed in AWS Certificate Manager
-    SSL_CERTIFICATE_DOMAIN=redaction.example.com # This is the domain of the SSL certificate that you have installed in AWS Certificate Manager
-    ```
 **Note: If you are using an SSL certificate with Cognito login on the application load balancer (strongly advised), you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**

 ### 1.  Create a python environment, load in packages from `requirements.txt`.
+You need a `cdk.json` in the `cdk` folder. It should contain the following:
+```json
+{
+    "app": "<PATH TO PYTHON ENVIRONMENT FOLDER WHERE REQUIREMENTS HAVE BEEN LOADED>/python.exe app.py",
+    "context": {
+        "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
+        "@aws-cdk/core:stackRelativeExports": true,
+        "@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
+        "@aws-cdk/aws-lambda:recognizeVersionProps": true,
+        "@aws-cdk/aws-lambda:recognizeLayerVersion": true,
+        "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
+        "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
+        "@aws-cdk/core:newStyleStackSynthesis": true,
+        "aws-cdk:enableDiffNoFail": true,
+        "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
+        "@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
+        "@aws-cdk/core:target-partitions": [
+        "aws",
+        "aws-cn"
+        ]
+    }
+}
+```
 ### 2.  Create a `cdk_config.env` file in the `config` subfolder.
 As a minimum, it is useful to put the following details in the `cdk_config.env` file (all values below are examples; other possible variables are listed in `cdk_config.py` in the `cdk` folder).
+```ini
+CDK_PREFIX=example-prefix # This prefix will be added to the name of most of the created elements in your stack
+NEW_VPC_CIDR=10.0.0.0/24 # The CIDR range for your newly created VPC
+AWS_REGION=<your-region> # Region where elements will be created
+AWS_ACCOUNT_ID=1234567890 # AWS account ID that has administrator access that you will use for deploying the stack
+CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
+CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
+COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
+COGNITO_AUTH=1 # Do you want to do in-app authentication (username and password only, not necessary if you are using an SSL certificate as recommended below)
+USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
+RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a Cloudfront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
+CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
+# If you are using an SSL certificate with your ALB (highly recommended):
+ACM_SSL_CERTIFICATE_ARN=<SSL Certificate ARN> # This is the ARN of the SSL certificate that you have installed in AWS Certificate Manager
+SSL_CERTIFICATE_DOMAIN=redaction.example.com # This is the domain of the SSL certificate that you have installed in AWS Certificate Manager
+```
 **Note: If you are using an SSL certificate with Cognito login on the application load balancer (strongly advised), you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**

tools/config.py CHANGED Viewed

@@ -209,6 +209,9 @@ if LOGGING == 'True':
     logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
 ###

     logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
+USAGE_LOG_FILE_NAME = get_or_create_env_var('USAGE_LOG_FILE_NAME', LOG_FILE_NAME)
+FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
 ###

tools/custom_csvlogger.py CHANGED Viewed

@@ -17,7 +17,6 @@ from gradio_client import utils as client_utils
 import gradio as gr
 from gradio import utils, wasm_utils
 from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
-from botocore.exceptions import NoCredentialsError, TokenRetrievalError
 if TYPE_CHECKING:
@@ -78,12 +77,15 @@ class CSVLogger_custom(FlaggingCallback):
         os.makedirs(self.flagging_dir, exist_ok=True)
         if replacement_headers:
             if len(replacement_headers) != len(self.components):
                 raise ValueError(
                     f"replacement_headers must have the same length as components "
                     f"({len(replacement_headers)} provided, {len(self.components)} expected)"
                 )
-            headers = replacement_headers + ["timestamp"]
         else:
             if additional_headers is None:
                 additional_headers = []
@@ -141,12 +143,14 @@ class CSVLogger_custom(FlaggingCallback):
     replacement_headers: list[str] | None = None
 ) -> int:
         if self.first_time:
             additional_headers = []
             if flag_option is not None:
                 additional_headers.append("flag")
             if username is not None:
                 additional_headers.append("username")
             additional_headers.append("id")
             self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
             self.first_time = False
@@ -177,13 +181,12 @@ class CSVLogger_custom(FlaggingCallback):
         if username is not None:
             csv_data.append(username)
-        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
-        csv_data.append(timestamp)
         generated_id = str(uuid.uuid4())
         csv_data.append(generated_id)
         # Build the headers
         headers = (
             [getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
@@ -192,8 +195,8 @@ class CSVLogger_custom(FlaggingCallback):
             headers.append("flag")
         if username is not None:
             headers.append("username")
-        headers.append("timestamp")
         headers.append("id")
         line_count = -1

 import gradio as gr
 from gradio import utils, wasm_utils
 from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
 if TYPE_CHECKING:
         os.makedirs(self.flagging_dir, exist_ok=True)
         if replacement_headers:
+            if additional_headers is None:
+                additional_headers = []
             if len(replacement_headers) != len(self.components):
                 raise ValueError(
                     f"replacement_headers must have the same length as components "
                     f"({len(replacement_headers)} provided, {len(self.components)} expected)"
                 )
+            headers = replacement_headers + additional_headers + ["timestamp"]
         else:
             if additional_headers is None:
                 additional_headers = []
     replacement_headers: list[str] | None = None
 ) -> int:
         if self.first_time:
+            print("First time creating file")
             additional_headers = []
             if flag_option is not None:
                 additional_headers.append("flag")
             if username is not None:
                 additional_headers.append("username")
             additional_headers.append("id")
+            #additional_headers.append("timestamp")
             self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
             self.first_time = False
         if username is not None:
             csv_data.append(username)
         generated_id = str(uuid.uuid4())
         csv_data.append(generated_id)
+        timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
+        csv_data.append(timestamp)
         # Build the headers
         headers = (
             [getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
             headers.append("flag")
         if username is not None:
             headers.append("username")
         headers.append("id")
+        headers.append("timestamp")
         line_count = -1

tools/find_duplicate_pages.py CHANGED Viewed

@@ -15,7 +15,7 @@ nlp = en_core_web_lg.load()
 similarity_threshold = 0.95
-def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
     """
     Combines text from multiple CSV files containing page and text columns.
     Groups text by file and page number, concatenating text within these groups.
@@ -52,7 +52,14 @@ def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLD
         df['text'] = df['text'].fillna('').astype(str)
         # Group by page and concatenate text
-        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
         # Add filename column
         grouped['file'] = os.path.basename(file_path)
@@ -143,7 +150,7 @@ def map_metadata_subdocument(subdocument_df:pd.DataFrame, metadata_source_df:pd.
     return final_df
-def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str) -> list:
     """
     Saves the main results DataFrame and generates per-file redaction lists.
     This function is extracted to be reusable.
@@ -151,6 +158,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str)
     Args:
         final_df (pd.DataFrame): The DataFrame containing the final match results.
         output_folder (str): The folder to save the output files.
     Returns:
         list: A list of paths to all generated files.
@@ -172,32 +180,33 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str)
     # 2. Save per-file redaction lists
     # Use 'Page2_File' as the source of duplicate content
-    grouping_col = 'Page2_File'
-    if grouping_col not in final_df.columns:
-        print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
-        return output_paths
-    for redact_file, group in final_df.groupby(grouping_col):
-        output_file_name_stem = Path(redact_file).stem
-        output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
-        all_pages_to_redact = set()
-        is_subdocument_match = 'Page2_Start_Page' in group.columns
-        if is_subdocument_match:
-            for _, row in group.iterrows():
-                pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
-                all_pages_to_redact.update(pages_in_range)
-        else:
-            pages = group['Page2_Page'].unique()
-            all_pages_to_redact.update(pages)
-        if all_pages_to_redact:
-            redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
-            redaction_df.to_csv(output_file_path, header=False, index=False)
-            output_paths.append(str(output_file_path))
-            print(f"Redaction list for {redact_file} saved to {output_file_path}")
     return output_paths
@@ -206,7 +215,8 @@ def identify_similar_pages(
     similarity_threshold: float = 0.9,
     min_word_count: int = 10,
     min_consecutive_pages: int = 1,
-    greedy_match: bool = False, # NEW parameter
     output_folder: str = OUTPUT_FOLDER,
     progress=Progress(track_tqdm=True)
 ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
@@ -341,7 +351,7 @@ def identify_similar_pages(
     progress(0.8, desc="Saving output files")
-    output_paths = save_results_and_redaction_lists(final_df, output_folder)
     return final_df, output_paths, df_combined
@@ -395,7 +405,7 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
     # Return the updated dataframe, the new file list, and clear the preview panes
     return updated_df, new_output_paths, None, None
-def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
     """
     Wrapper function updated to include the 'greedy_match' boolean.
     """
@@ -404,7 +414,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
         return None, None, None
     progress(0, desc="Combining input files...")
-    df_combined, _ = combine_ocr_output_text(files)
     if df_combined.empty:
         gr.Warning("No data found in the uploaded files.")
@@ -417,6 +427,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
         min_word_count=min_words,
         min_consecutive_pages=int(min_consecutive),
         greedy_match=greedy_match,
         progress=progress
     )

 similarity_threshold = 0.95
+def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, output_folder:str=OUTPUT_FOLDER):
     """
     Combines text from multiple CSV files containing page and text columns.
     Groups text by file and page number, concatenating text within these groups.
         df['text'] = df['text'].fillna('').astype(str)
         # Group by page and concatenate text
+        if combine_pages == True:
+            grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
+        else:
+            df['line_number_by_page'] = df.groupby('page').cumcount() + 1
+            df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
+            df['page'] = df['page'].astype(int)
+            grouped = df.drop('line_number_by_page', axis=1)
         # Add filename column
         grouped['file'] = os.path.basename(file_path)
     return final_df
+def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str, combine_pages:bool = True) -> list:
     """
     Saves the main results DataFrame and generates per-file redaction lists.
     This function is extracted to be reusable.
     Args:
         final_df (pd.DataFrame): The DataFrame containing the final match results.
         output_folder (str): The folder to save the output files.
+        combine_pages (bool, optional): Boolean to check whether the text from pages have been combined into one, or if instead the duplicate match has been conducted line by line.
     Returns:
         list: A list of paths to all generated files.
     # 2. Save per-file redaction lists
     # Use 'Page2_File' as the source of duplicate content
+    if combine_pages == True:
+        grouping_col = 'Page2_File'
+        if grouping_col not in final_df.columns:
+            print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
+            return output_paths
+        for redact_file, group in final_df.groupby(grouping_col):
+            output_file_name_stem = Path(redact_file).stem
+            output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
+            all_pages_to_redact = set()
+            is_subdocument_match = 'Page2_Start_Page' in group.columns
+            if is_subdocument_match:
+                for _, row in group.iterrows():
+                    pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
+                    all_pages_to_redact.update(pages_in_range)
+            else:
+                pages = group['Page2_Page'].unique()
+                all_pages_to_redact.update(pages)
+            if all_pages_to_redact:
+                redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
+                redaction_df.to_csv(output_file_path, header=False, index=False)
+                output_paths.append(str(output_file_path))
+                print(f"Redaction list for {redact_file} saved to {output_file_path}")
     return output_paths
     similarity_threshold: float = 0.9,
     min_word_count: int = 10,
     min_consecutive_pages: int = 1,
+    greedy_match: bool = False,
+    combine_pages:bool=True,
     output_folder: str = OUTPUT_FOLDER,
     progress=Progress(track_tqdm=True)
 ) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
     progress(0.8, desc="Saving output files")
+    output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
     return final_df, output_paths, df_combined
     # Return the updated dataframe, the new file list, and clear the preview panes
     return updated_df, new_output_paths, None, None
+def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, duplicates_by_line_or_page_bool:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
     """
     Wrapper function updated to include the 'greedy_match' boolean.
     """
         return None, None, None
     progress(0, desc="Combining input files...")
+    df_combined, _ = combine_ocr_output_text(files, combine_pages=duplicates_by_line_or_page_bool)
     if df_combined.empty:
         gr.Warning("No data found in the uploaded files.")
         min_word_count=min_words,
         min_consecutive_pages=int(min_consecutive),
         greedy_match=greedy_match,
+        combine_pages=duplicates_by_line_or_page_bool,
         progress=progress
     )