Commit
·
e424038
1
Parent(s):
9f51e70
Updated packages. Corrected CSV logger headings; custom log CSV file names can now be submitted to S3. Started work on identifying and deduplicating text at the line level.
Browse files
- app.py +18 -17
- cdk/cdk_stack.py +5 -5
- pyproject.toml +4 -4
- requirements.txt +3 -3
- src/installation_guide.qmd +40 -42
- tools/config.py +3 -0
- tools/custom_csvlogger.py +10 -7
- tools/find_duplicate_pages.py +43 -32
app.py
CHANGED
@@ -2,7 +2,7 @@ import os
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
-
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
@@ -91,12 +91,12 @@ with app:
|
|
91 |
backup_image_annotations_state = gr.State([])
|
92 |
backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
|
93 |
|
94 |
-
# Logging
|
95 |
-
feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
|
96 |
-
feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
|
97 |
access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
|
98 |
access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
|
99 |
-
|
|
|
|
|
100 |
usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
|
101 |
|
102 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
@@ -408,6 +408,7 @@ with app:
|
|
408 |
with gr.Row():
|
409 |
duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
|
410 |
min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
|
|
|
411 |
|
412 |
gr.Markdown("#### Matching Strategy")
|
413 |
greedy_match_input = gr.Checkbox(
|
@@ -681,7 +682,9 @@ with app:
|
|
681 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
|
682 |
|
683 |
# Apply page redactions
|
684 |
-
annotation_button_apply.click(
|
|
|
|
|
685 |
|
686 |
# Save current page redactions
|
687 |
update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
@@ -768,7 +771,8 @@ with app:
|
|
768 |
duplicate_threshold_input,
|
769 |
min_word_count_input,
|
770 |
min_consecutive_pages_input,
|
771 |
-
greedy_match_input
|
|
|
772 |
],
|
773 |
outputs=[
|
774 |
results_df_preview,
|
@@ -837,8 +841,6 @@ with app:
|
|
837 |
app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
|
838 |
|
839 |
|
840 |
-
# If relevant environment variable is set, load in the Textract job details
|
841 |
-
|
842 |
# If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
|
843 |
if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
|
844 |
if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
|
@@ -870,40 +872,39 @@ with app:
|
|
870 |
### ACCESS LOGS
|
871 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
872 |
access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
|
|
|
873 |
access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
|
874 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
|
875 |
success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
876 |
|
877 |
### FEEDBACK LOGS
|
|
|
|
|
|
|
878 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
879 |
# User submitted feedback for pdf redactions
|
880 |
-
pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
|
881 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
|
882 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
|
883 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
884 |
|
885 |
-
# User submitted feedback for data redactions
|
886 |
-
data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
|
887 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
|
888 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
|
889 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
890 |
else:
|
891 |
# User submitted feedback for pdf redactions
|
892 |
-
pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
|
893 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
|
894 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
|
895 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
896 |
|
897 |
# User submitted feedback for data redactions
|
898 |
-
data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
|
899 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
|
900 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
|
901 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
902 |
|
903 |
### USAGE LOGS
|
904 |
# Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
|
905 |
-
|
906 |
-
usage_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
|
907 |
|
908 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
909 |
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
@@ -917,7 +918,7 @@ with app:
|
|
917 |
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
918 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
919 |
else:
|
920 |
-
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs,
|
921 |
|
922 |
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
923 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
|
|
2 |
import pandas as pd
|
3 |
import gradio as gr
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
+
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS, USAGE_LOG_FILE_NAME, FEEDBACK_LOG_FILE_NAME
|
6 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars, _get_env_list
|
7 |
from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
|
8 |
from tools.file_redaction import choose_and_run_redactor
|
|
|
91 |
backup_image_annotations_state = gr.State([])
|
92 |
backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
|
93 |
|
94 |
+
# Logging variables
|
|
|
|
|
95 |
access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
|
96 |
access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
|
97 |
+
feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + FEEDBACK_LOG_FILE_NAME, visible=False)
|
98 |
+
feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
|
99 |
+
usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + USAGE_LOG_FILE_NAME, visible=False)
|
100 |
usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
|
101 |
|
102 |
session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
|
|
|
408 |
with gr.Row():
|
409 |
duplicate_threshold_input = gr.Number(value=0.95, label="Similarity threshold", info="Score (0-1) to consider pages a match.")
|
410 |
min_word_count_input = gr.Number(value=10, label="Minimum word count", info="Pages with fewer words than this value are ignored.")
|
411 |
+
duplicates_by_line_or_page_bool = gr.Checkbox(value=True, label="Analyse duplicate text by page (off for by line)")
|
412 |
|
413 |
gr.Markdown("#### Matching Strategy")
|
414 |
greedy_match_input = gr.Checkbox(
|
|
|
682 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df])
|
683 |
|
684 |
# Apply page redactions
|
685 |
+
annotation_button_apply.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
686 |
+
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_df, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
687 |
+
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_df, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_df], scroll_to_output=True)
|
688 |
|
689 |
# Save current page redactions
|
690 |
update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
|
|
771 |
duplicate_threshold_input,
|
772 |
min_word_count_input,
|
773 |
min_consecutive_pages_input,
|
774 |
+
greedy_match_input,
|
775 |
+
duplicates_by_line_or_page_bool
|
776 |
],
|
777 |
outputs=[
|
778 |
results_df_preview,
|
|
|
841 |
app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_whole_document_textract_input_subfolder, s3_whole_document_textract_output_subfolder, s3_whole_document_textract_logs_subfolder, local_whole_document_textract_logs_subfolder])
|
842 |
|
843 |
|
|
|
|
|
844 |
# If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
|
845 |
if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
|
846 |
if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
|
|
|
872 |
### ACCESS LOGS
|
873 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
874 |
access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
|
875 |
+
|
876 |
access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
|
877 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
|
878 |
success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
879 |
|
880 |
### FEEDBACK LOGS
|
881 |
+
pdf_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
|
882 |
+
data_callback = CSVLogger_custom(dataset_file_name=FEEDBACK_LOG_FILE_NAME)
|
883 |
+
|
884 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
885 |
# User submitted feedback for pdf redactions
|
|
|
886 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
|
887 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
|
888 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
889 |
|
890 |
+
# User submitted feedback for data redactions
|
|
|
891 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
|
892 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
|
893 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
894 |
else:
|
895 |
# User submitted feedback for pdf redactions
|
|
|
896 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
|
897 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
|
898 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
899 |
|
900 |
# User submitted feedback for data redactions
|
|
|
901 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
|
902 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
|
903 |
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
904 |
|
905 |
### USAGE LOGS
|
906 |
# Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
|
907 |
+
usage_callback = CSVLogger_custom(dataset_file_name=USAGE_LOG_FILE_NAME)
|
|
|
908 |
|
909 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
910 |
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
|
|
918 |
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
919 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
920 |
else:
|
921 |
+
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
922 |
|
923 |
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, placeholder_doc_file_name_no_extension_textbox_for_logs, blank_data_file_name_no_extension_textbox_for_logs, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
924 |
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
cdk/cdk_stack.py
CHANGED
@@ -26,7 +26,7 @@ from aws_cdk import (
|
|
26 |
)
|
27 |
|
28 |
from constructs import Construct
|
29 |
-
from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_SSL_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME, NEW_VPC_DEFAULT_NAME, NEW_VPC_CIDR, USE_CUSTOM_KMS_KEY,
|
30 |
from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
|
31 |
|
32 |
def _get_env_list(env_var_name: str) -> List[str]:
|
@@ -420,7 +420,7 @@ class CdkStack(Stack):
|
|
420 |
|
421 |
# --- IAM Roles ---
|
422 |
if USE_CUSTOM_KMS_KEY == '1':
|
423 |
-
kms_key = kms.Key(self, "RedactionSharedKmsKey", alias=
|
424 |
|
425 |
custom_sts_kms_policy_dict = {
|
426 |
"Version": "2012-10-17",
|
@@ -877,7 +877,7 @@ class CdkStack(Stack):
|
|
877 |
if get_context_bool(f"exists:{secret_name}"):
|
878 |
# Lookup by name
|
879 |
secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
|
880 |
-
print(f"Using existing Secret
|
881 |
else:
|
882 |
if USE_CUSTOM_KMS_KEY == '1' and isinstance(kms_key, kms.Key):
|
883 |
secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
|
@@ -899,7 +899,7 @@ class CdkStack(Stack):
|
|
899 |
}
|
900 |
)
|
901 |
|
902 |
-
print(f"Created new secret
|
903 |
|
904 |
except Exception as e:
|
905 |
raise Exception("Could not handle Secrets Manager secret due to:", e)
|
@@ -1235,7 +1235,7 @@ class CdkStack(Stack):
|
|
1235 |
self,
|
1236 |
"MyHttpsListener", # Logical ID for the HTTPS listener
|
1237 |
alb,
|
1238 |
-
|
1239 |
default_target_group=target_group,
|
1240 |
enable_cognito_auth=True,
|
1241 |
cognito_user_pool=user_pool,
|
|
|
26 |
)
|
27 |
|
28 |
from constructs import Construct
|
29 |
+
from cdk_config import CDK_PREFIX, VPC_NAME, AWS_MANAGED_TASK_ROLES_LIST, GITHUB_REPO_USERNAME, GITHUB_REPO_NAME, GITHUB_REPO_BRANCH, ECS_TASK_MEMORY_SIZE, ECS_TASK_CPU_SIZE, CUSTOM_HEADER, CUSTOM_HEADER_VALUE, AWS_REGION, CLOUDFRONT_GEO_RESTRICTION, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS, GRADIO_SERVER_PORT, PUBLIC_SUBNETS_TO_USE, PUBLIC_SUBNET_CIDR_BLOCKS, PUBLIC_SUBNET_AVAILABILITY_ZONES, PRIVATE_SUBNETS_TO_USE, PRIVATE_SUBNET_CIDR_BLOCKS, PRIVATE_SUBNET_AVAILABILITY_ZONES, CODEBUILD_PROJECT_NAME, ECS_SECURITY_GROUP_NAME, ALB_NAME_SECURITY_GROUP_NAME, ALB_NAME, COGNITO_USER_POOL_NAME, COGNITO_USER_POOL_CLIENT_NAME, COGNITO_USER_POOL_CLIENT_SECRET_NAME, FARGATE_TASK_DEFINITION_NAME, ECS_SERVICE_NAME, WEB_ACL_NAME, CLOUDFRONT_DISTRIBUTION_NAME, ECS_TASK_ROLE_NAME, ALB_TARGET_GROUP_NAME, S3_LOG_CONFIG_BUCKET_NAME, S3_OUTPUT_BUCKET_NAME, ACM_SSL_CERTIFICATE_ARN, CLUSTER_NAME, CODEBUILD_ROLE_NAME, ECS_TASK_EXECUTION_ROLE_NAME, ECR_CDK_REPO_NAME, ECS_LOG_GROUP_NAME, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, USAGE_LOG_DYNAMODB_TABLE_NAME, TASK_DEFINITION_FILE_LOCATION, EXISTING_IGW_ID, SINGLE_NAT_GATEWAY_ID, NAT_GATEWAY_NAME, COGNITO_USER_POOL_DOMAIN_PREFIX, COGNITO_REDIRECTION_URL, AWS_ACCOUNT_ID, ECS_USE_FARGATE_SPOT, ECS_READ_ONLY_FILE_SYSTEM, USE_CLOUDFRONT, LOAD_BALANCER_WEB_ACL_NAME, NEW_VPC_DEFAULT_NAME, NEW_VPC_CIDR, USE_CUSTOM_KMS_KEY, CUSTOM_KMS_KEY_NAME
|
30 |
from cdk_functions import create_subnets, create_web_acl_with_common_rules, add_custom_policies, add_alb_https_listener_with_cert, create_nat_gateway # Only keep CDK-native functions
|
31 |
|
32 |
def _get_env_list(env_var_name: str) -> List[str]:
|
|
|
420 |
|
421 |
# --- IAM Roles ---
|
422 |
if USE_CUSTOM_KMS_KEY == '1':
|
423 |
+
kms_key = kms.Key(self, "RedactionSharedKmsKey", alias=CUSTOM_KMS_KEY_NAME, removal_policy=RemovalPolicy.DESTROY)
|
424 |
|
425 |
custom_sts_kms_policy_dict = {
|
426 |
"Version": "2012-10-17",
|
|
|
877 |
if get_context_bool(f"exists:{secret_name}"):
|
878 |
# Lookup by name
|
879 |
secret = secretsmanager.Secret.from_secret_name_v2(self, "CognitoSecret", secret_name=secret_name)
|
880 |
+
print(f"Using existing Secret.")
|
881 |
else:
|
882 |
if USE_CUSTOM_KMS_KEY == '1' and isinstance(kms_key, kms.Key):
|
883 |
secret = secretsmanager.Secret(self, "CognitoSecret", # Logical ID
|
|
|
899 |
}
|
900 |
)
|
901 |
|
902 |
+
print(f"Created new secret in Secrets Manager for Cognito user pool and related details.")
|
903 |
|
904 |
except Exception as e:
|
905 |
raise Exception("Could not handle Secrets Manager secret due to:", e)
|
|
|
1235 |
self,
|
1236 |
"MyHttpsListener", # Logical ID for the HTTPS listener
|
1237 |
alb,
|
1238 |
+
acm_certificate_arn=ACM_SSL_CERTIFICATE_ARN,
|
1239 |
default_target_group=target_group,
|
1240 |
enable_cognito_auth=True,
|
1241 |
cognito_user_pool=user_pool,
|
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
-
version = "0.7.
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
|
|
12 |
dependencies = [
|
13 |
"pdfminer.six==20240706",
|
14 |
"pdf2image==1.17.0",
|
15 |
-
"pymupdf==1.
|
16 |
"opencv-python==4.10.0.84",
|
17 |
"presidio_analyzer==2.2.358",
|
18 |
"presidio_anonymizer==2.2.358",
|
@@ -24,14 +24,14 @@ dependencies = [
|
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
"gradio==5.34.2",
|
27 |
-
"boto3==1.
|
28 |
"pyarrow==19.0.1",
|
29 |
"openpyxl==3.1.5",
|
30 |
"Faker==36.1.1",
|
31 |
"python-levenshtein==0.26.1",
|
32 |
"spaczz==0.6.1",
|
33 |
# Direct URL dependency for gradio_image_annotator wheel
|
34 |
-
"gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.
|
35 |
"rapidfuzz==3.12.1",
|
36 |
"python-dotenv==1.0.1",
|
37 |
"numpy==1.26.4",
|
|
|
4 |
|
5 |
[project]
|
6 |
name = "doc_redaction"
|
7 |
+
version = "0.7.2"
|
8 |
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
|
9 |
readme = "README.md"
|
10 |
requires-python = ">=3.10"
|
|
|
12 |
dependencies = [
|
13 |
"pdfminer.six==20240706",
|
14 |
"pdf2image==1.17.0",
|
15 |
+
"pymupdf==1.26.1",
|
16 |
"opencv-python==4.10.0.84",
|
17 |
"presidio_analyzer==2.2.358",
|
18 |
"presidio_anonymizer==2.2.358",
|
|
|
24 |
# Direct URL dependency for spacy model
|
25 |
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
|
26 |
"gradio==5.34.2",
|
27 |
+
"boto3==1.39.1",
|
28 |
"pyarrow==19.0.1",
|
29 |
"openpyxl==3.1.5",
|
30 |
"Faker==36.1.1",
|
31 |
"python-levenshtein==0.26.1",
|
32 |
"spaczz==0.6.1",
|
33 |
# Direct URL dependency for gradio_image_annotator wheel
|
34 |
+
"gradio_image_annotation @ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl",
|
35 |
"rapidfuzz==3.12.1",
|
36 |
"python-dotenv==1.0.1",
|
37 |
"numpy==1.26.4",
|
requirements.txt
CHANGED
@@ -1,6 +1,6 @@
|
|
1 |
pdfminer.six==20240706
|
2 |
pdf2image==1.17.0
|
3 |
-
pymupdf==1.
|
4 |
opencv-python==4.10.0.84
|
5 |
presidio_analyzer==2.2.358
|
6 |
presidio_anonymizer==2.2.358
|
@@ -11,14 +11,14 @@ scikit-learn==1.6.1
|
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
gradio==5.34.2
|
14 |
-
boto3==1.
|
15 |
pyarrow==19.0.1
|
16 |
openpyxl==3.1.5
|
17 |
Faker==36.1.1
|
18 |
python-levenshtein==0.26.1
|
19 |
spaczz==0.6.1
|
20 |
# The following version
|
21 |
-
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.
|
22 |
rapidfuzz==3.12.1
|
23 |
python-dotenv==1.0.1
|
24 |
numpy==1.26.4
|
|
|
1 |
pdfminer.six==20240706
|
2 |
pdf2image==1.17.0
|
3 |
+
pymupdf==1.26.1
|
4 |
opencv-python==4.10.0.84
|
5 |
presidio_analyzer==2.2.358
|
6 |
presidio_anonymizer==2.2.358
|
|
|
11 |
spacy==3.8.7
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
gradio==5.34.2
|
14 |
+
boto3==1.39.1
|
15 |
pyarrow==19.0.1
|
16 |
openpyxl==3.1.5
|
17 |
Faker==36.1.1
|
18 |
python-levenshtein==0.26.1
|
19 |
spaczz==0.6.1
|
20 |
# The following version
|
21 |
+
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
rapidfuzz==3.12.1
|
23 |
python-dotenv==1.0.1
|
24 |
numpy==1.26.4
|
src/installation_guide.qmd
CHANGED
@@ -42,30 +42,30 @@ Update your DNS records to include the CNAME record given by AWS. After your sta
|
|
42 |
|
43 |
### 1. Create a python environment, load in packages from `requirements.txt`.
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
|
70 |
### 2. Create a `cdk_config.env` file in the `config` subfolder.
|
71 |
|
@@ -75,24 +75,22 @@ Depending on which environment variables you put in this file, you can choose wh
|
|
75 |
|
76 |
At a minimum, it is useful to put the following details in the `cdk_config.env` file. All values below are examples; other possible variables can be seen in `cdk_config.py` in the `cdk` folder.
|
77 |
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
```
|
96 |
|
97 |
**Note: If you are using an SSL certificate with Cognito login on the application load balancer (strongly advised), you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**
|
98 |
|
|
|
42 |
|
43 |
### 1. Create a python environment, load in packages from `requirements.txt`.
|
44 |
|
45 |
+
You need a `cdk.json` in the `cdk` folder. It should contain the following:
|
46 |
+
|
47 |
+
```json
|
48 |
+
{
|
49 |
+
"app": "<PATH TO PYTHON ENVIRONMENT FOLDER WHERE REQUIREMENTS HAVE BEEN LOADED>/python.exe app.py",
|
50 |
+
"context": {
|
51 |
+
"@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true,
|
52 |
+
"@aws-cdk/core:stackRelativeExports": true,
|
53 |
+
"@aws-cdk/aws-rds:lowercaseDbIdentifier": true,
|
54 |
+
"@aws-cdk/aws-lambda:recognizeVersionProps": true,
|
55 |
+
"@aws-cdk/aws-lambda:recognizeLayerVersion": true,
|
56 |
+
"@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true,
|
57 |
+
"@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true,
|
58 |
+
"@aws-cdk/core:newStyleStackSynthesis": true,
|
59 |
+
"aws-cdk:enableDiffNoFail": true,
|
60 |
+
"@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true,
|
61 |
+
"@aws-cdk/aws-apigateway:disableCloudWatchRole": true,
|
62 |
+
"@aws-cdk/core:target-partitions": [
|
63 |
+
"aws",
|
64 |
+
"aws-cn"
|
65 |
+
]
|
66 |
+
}
|
67 |
+
}
|
68 |
+
```
|
69 |
|
70 |
### 2. Create a `cdk_config.env` file in the `config` subfolder.
|
71 |
|
|
|
75 |
|
76 |
At a minimum, it is useful to put the following details in the `cdk_config.env` file. All values below are examples; other possible variables can be seen in `cdk_config.py` in the `cdk` folder.
|
77 |
|
78 |
+
```ini
|
79 |
+
CDK_PREFIX=example-prefix # This prefix will be added to the name of most of the created elements in your stack
|
80 |
+
NEW_VPC_CIDR=10.0.0.0/24 # The CIDR range for your newly created VPC
|
81 |
+
AWS_REGION=<your-region> # Region where elements will be created
|
82 |
+
AWS_ACCOUNT_ID=123456789012 # The 12-digit ID of the AWS account (with administrator access) that you will use for deploying the stack
|
83 |
+
CDK_FOLDER=C:/path_to_cdk_folder/ # The place where the cdk folder code is located
|
84 |
+
CONTEXT_FILE=C:/path_to_cdk_folder/cdk.context.json
|
85 |
+
COGNITO_USER_POOL_DOMAIN_PREFIX=redaction-12345 # The prefix of the login / user sign up domain that you want to use with Cognito login. Should not contain the terms amazon, aws, or cognito.
|
86 |
+
COGNITO_AUTH=1 # Set to 1 to use in-app authentication (username and password only). Not necessary if you are using an SSL certificate as recommended below
|
87 |
+
USE_CLOUDFRONT=True # Recommended. If you intend to use CloudFront as the front URL to your application load balancer (ALB). This has some extra security features that you won't get with just an ALB, e.g. limiting app access by country.
|
88 |
+
RUN_USEAST_STACK=False # Set this to True only if you have permissions to create a CloudFront distribution and web ACL on top of it in the us-east-1 region. If you don't, the section below shows how you can create the CloudFront resource manually and map it to your application load balancer (as you should have permissions for that if you are admin in your region).
|
89 |
+
CLOUDFRONT_DOMAIN=<example>.cloudfront.net # If you already know the domain of the CloudFront distribution that you want to use, you can add this here.
|
90 |
+
# If you are using an SSL certificate with your ALB (highly recommended):
|
91 |
+
ACM_SSL_CERTIFICATE_ARN=<SSL Certificate ARN> # This is the ARN of the SSL certificate that you have installed in AWS Certificate Manager
|
92 |
+
SSL_CERTIFICATE_DOMAIN=redaction.example.com # This is the domain of the SSL certificate that you have installed in AWS Certificate Manager
|
93 |
+
```
|
|
|
|
|
94 |
|
95 |
**Note: If you are using an SSL certificate with Cognito login on the application load balancer (strongly advised), you can set COGNITO_AUTH to 0 above, as you don't need the second login step to get to the app**
|
96 |
|
tools/config.py
CHANGED
@@ -209,6 +209,9 @@ if LOGGING == 'True':
|
|
209 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
210 |
|
211 |
LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
|
|
|
|
|
|
|
212 |
|
213 |
|
214 |
###
|
|
|
209 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
210 |
|
211 |
LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
|
212 |
+
USAGE_LOG_FILE_NAME = get_or_create_env_var('USAGE_LOG_FILE_NAME', LOG_FILE_NAME)
|
213 |
+
FEEDBACK_LOG_FILE_NAME = get_or_create_env_var('FEEDBACK_LOG_FILE_NAME', LOG_FILE_NAME)
|
214 |
+
|
215 |
|
216 |
|
217 |
###
|
tools/custom_csvlogger.py
CHANGED
@@ -17,7 +17,6 @@ from gradio_client import utils as client_utils
|
|
17 |
import gradio as gr
|
18 |
from gradio import utils, wasm_utils
|
19 |
from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
|
20 |
-
from botocore.exceptions import NoCredentialsError, TokenRetrievalError
|
21 |
|
22 |
|
23 |
if TYPE_CHECKING:
|
@@ -78,12 +77,15 @@ class CSVLogger_custom(FlaggingCallback):
|
|
78 |
os.makedirs(self.flagging_dir, exist_ok=True)
|
79 |
|
80 |
if replacement_headers:
|
|
|
|
|
|
|
81 |
if len(replacement_headers) != len(self.components):
|
82 |
raise ValueError(
|
83 |
f"replacement_headers must have the same length as components "
|
84 |
f"({len(replacement_headers)} provided, {len(self.components)} expected)"
|
85 |
)
|
86 |
-
headers = replacement_headers + ["timestamp"]
|
87 |
else:
|
88 |
if additional_headers is None:
|
89 |
additional_headers = []
|
@@ -141,12 +143,14 @@ class CSVLogger_custom(FlaggingCallback):
|
|
141 |
replacement_headers: list[str] | None = None
|
142 |
) -> int:
|
143 |
if self.first_time:
|
|
|
144 |
additional_headers = []
|
145 |
if flag_option is not None:
|
146 |
additional_headers.append("flag")
|
147 |
if username is not None:
|
148 |
additional_headers.append("username")
|
149 |
additional_headers.append("id")
|
|
|
150 |
self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
|
151 |
self.first_time = False
|
152 |
|
@@ -177,13 +181,12 @@ class CSVLogger_custom(FlaggingCallback):
|
|
177 |
if username is not None:
|
178 |
csv_data.append(username)
|
179 |
|
180 |
-
|
181 |
-
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
|
182 |
-
csv_data.append(timestamp)
|
183 |
-
|
184 |
generated_id = str(uuid.uuid4())
|
185 |
csv_data.append(generated_id)
|
186 |
|
|
|
|
|
|
|
187 |
# Build the headers
|
188 |
headers = (
|
189 |
[getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
|
@@ -192,8 +195,8 @@ class CSVLogger_custom(FlaggingCallback):
|
|
192 |
headers.append("flag")
|
193 |
if username is not None:
|
194 |
headers.append("username")
|
195 |
-
headers.append("timestamp")
|
196 |
headers.append("id")
|
|
|
197 |
|
198 |
line_count = -1
|
199 |
|
|
|
17 |
import gradio as gr
|
18 |
from gradio import utils, wasm_utils
|
19 |
from tools.config import AWS_REGION, AWS_ACCESS_KEY, AWS_SECRET_KEY, RUN_AWS_FUNCTIONS
|
|
|
20 |
|
21 |
|
22 |
if TYPE_CHECKING:
|
|
|
77 |
os.makedirs(self.flagging_dir, exist_ok=True)
|
78 |
|
79 |
if replacement_headers:
|
80 |
+
if additional_headers is None:
|
81 |
+
additional_headers = []
|
82 |
+
|
83 |
if len(replacement_headers) != len(self.components):
|
84 |
raise ValueError(
|
85 |
f"replacement_headers must have the same length as components "
|
86 |
f"({len(replacement_headers)} provided, {len(self.components)} expected)"
|
87 |
)
|
88 |
+
headers = replacement_headers + additional_headers + ["timestamp"]
|
89 |
else:
|
90 |
if additional_headers is None:
|
91 |
additional_headers = []
|
|
|
143 |
replacement_headers: list[str] | None = None
|
144 |
) -> int:
|
145 |
if self.first_time:
|
146 |
+
print("First time creating file")
|
147 |
additional_headers = []
|
148 |
if flag_option is not None:
|
149 |
additional_headers.append("flag")
|
150 |
if username is not None:
|
151 |
additional_headers.append("username")
|
152 |
additional_headers.append("id")
|
153 |
+
#additional_headers.append("timestamp")
|
154 |
self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
|
155 |
self.first_time = False
|
156 |
|
|
|
181 |
if username is not None:
|
182 |
csv_data.append(username)
|
183 |
|
|
|
|
|
|
|
|
|
184 |
generated_id = str(uuid.uuid4())
|
185 |
csv_data.append(generated_id)
|
186 |
|
187 |
+
timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] # Correct format for Amazon Athena
|
188 |
+
csv_data.append(timestamp)
|
189 |
+
|
190 |
# Build the headers
|
191 |
headers = (
|
192 |
[getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
|
|
|
195 |
headers.append("flag")
|
196 |
if username is not None:
|
197 |
headers.append("username")
|
|
|
198 |
headers.append("id")
|
199 |
+
headers.append("timestamp")
|
200 |
|
201 |
line_count = -1
|
202 |
|
tools/find_duplicate_pages.py
CHANGED
@@ -15,7 +15,7 @@ nlp = en_core_web_lg.load()
|
|
15 |
|
16 |
similarity_threshold = 0.95
|
17 |
|
18 |
-
def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLDER):
|
19 |
"""
|
20 |
Combines text from multiple CSV files containing page and text columns.
|
21 |
Groups text by file and page number, concatenating text within these groups.
|
@@ -52,7 +52,14 @@ def combine_ocr_output_text(input_files:List[str], output_folder:str=OUTPUT_FOLD
|
|
52 |
df['text'] = df['text'].fillna('').astype(str)
|
53 |
|
54 |
# Group by page and concatenate text
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
|
57 |
# Add filename column
|
58 |
grouped['file'] = os.path.basename(file_path)
|
@@ -143,7 +150,7 @@ def map_metadata_subdocument(subdocument_df:pd.DataFrame, metadata_source_df:pd.
|
|
143 |
|
144 |
return final_df
|
145 |
|
146 |
-
def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str) -> list:
|
147 |
"""
|
148 |
Saves the main results DataFrame and generates per-file redaction lists.
|
149 |
This function is extracted to be reusable.
|
@@ -151,6 +158,7 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str)
|
|
151 |
Args:
|
152 |
final_df (pd.DataFrame): The DataFrame containing the final match results.
|
153 |
output_folder (str): The folder to save the output files.
|
|
|
154 |
|
155 |
Returns:
|
156 |
list: A list of paths to all generated files.
|
@@ -172,32 +180,33 @@ def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str)
|
|
172 |
|
173 |
# 2. Save per-file redaction lists
|
174 |
# Use 'Page2_File' as the source of duplicate content
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
|
|
198 |
|
199 |
-
|
200 |
-
|
201 |
|
202 |
return output_paths
|
203 |
|
@@ -206,7 +215,8 @@ def identify_similar_pages(
|
|
206 |
similarity_threshold: float = 0.9,
|
207 |
min_word_count: int = 10,
|
208 |
min_consecutive_pages: int = 1,
|
209 |
-
greedy_match: bool = False,
|
|
|
210 |
output_folder: str = OUTPUT_FOLDER,
|
211 |
progress=Progress(track_tqdm=True)
|
212 |
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
|
@@ -341,7 +351,7 @@ def identify_similar_pages(
|
|
341 |
|
342 |
progress(0.8, desc="Saving output files")
|
343 |
|
344 |
-
output_paths = save_results_and_redaction_lists(final_df, output_folder)
|
345 |
|
346 |
return final_df, output_paths, df_combined
|
347 |
|
@@ -395,7 +405,7 @@ def exclude_match(results_df:pd.DataFrame, selected_index:int, output_folder="./
|
|
395 |
# Return the updated dataframe, the new file list, and clear the preview panes
|
396 |
return updated_df, new_output_paths, None, None
|
397 |
|
398 |
-
def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
|
399 |
"""
|
400 |
Wrapper function updated to include the 'greedy_match' boolean.
|
401 |
"""
|
@@ -404,7 +414,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
|
|
404 |
return None, None, None
|
405 |
|
406 |
progress(0, desc="Combining input files...")
|
407 |
-
df_combined, _ = combine_ocr_output_text(files)
|
408 |
|
409 |
if df_combined.empty:
|
410 |
gr.Warning("No data found in the uploaded files.")
|
@@ -417,6 +427,7 @@ def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:
|
|
417 |
min_word_count=min_words,
|
418 |
min_consecutive_pages=int(min_consecutive),
|
419 |
greedy_match=greedy_match,
|
|
|
420 |
progress=progress
|
421 |
)
|
422 |
|
|
|
15 |
|
16 |
similarity_threshold = 0.95
|
17 |
|
18 |
+
def combine_ocr_output_text(input_files:List[str], combine_pages:bool=True, output_folder:str=OUTPUT_FOLDER):
|
19 |
"""
|
20 |
Combines text from multiple CSV files containing page and text columns.
|
21 |
Groups text by file and page number, concatenating text within these groups.
|
|
|
52 |
df['text'] = df['text'].fillna('').astype(str)
|
53 |
|
54 |
# Group by page and concatenate text
|
55 |
+
if combine_pages == True:
|
56 |
+
grouped = df.groupby('page')['text'].apply(' '.join).reset_index()
|
57 |
+
else:
|
58 |
+
df['line_number_by_page'] = df.groupby('page').cumcount() + 1
|
59 |
+
df['page'] = df['page'].astype(str).str.zfill(5) + df['line_number_by_page'].astype(str).str.zfill(5)
|
60 |
+
df['page'] = df['page'].astype(int)
|
61 |
+
|
62 |
+
grouped = df.drop('line_number_by_page', axis=1)
|
63 |
|
64 |
# Add filename column
|
65 |
grouped['file'] = os.path.basename(file_path)
|
|
|
150 |
|
151 |
return final_df
|
152 |
|
153 |
+
def save_results_and_redaction_lists(final_df: pd.DataFrame, output_folder: str, combine_pages:bool = True) -> list:
|
154 |
"""
|
155 |
Saves the main results DataFrame and generates per-file redaction lists.
|
156 |
This function is extracted to be reusable.
|
|
|
158 |
Args:
|
159 |
final_df (pd.DataFrame): The DataFrame containing the final match results.
|
160 |
output_folder (str): The folder to save the output files.
|
161 |
+
combine_pages (bool, optional): Boolean to check whether the text from pages have been combined into one, or if instead the duplicate match has been conducted line by line.
|
162 |
|
163 |
Returns:
|
164 |
list: A list of paths to all generated files.
|
|
|
180 |
|
181 |
# 2. Save per-file redaction lists
|
182 |
# Use 'Page2_File' as the source of duplicate content
|
183 |
+
if combine_pages == True:
|
184 |
+
grouping_col = 'Page2_File'
|
185 |
+
if grouping_col not in final_df.columns:
|
186 |
+
print("Warning: 'Page2_File' column not found. Cannot generate redaction lists.")
|
187 |
+
return output_paths
|
188 |
+
|
189 |
+
for redact_file, group in final_df.groupby(grouping_col):
|
190 |
+
output_file_name_stem = Path(redact_file).stem
|
191 |
+
output_file_path = output_folder_path / f"{output_file_name_stem}_pages_to_redact.csv"
|
192 |
+
|
193 |
+
all_pages_to_redact = set()
|
194 |
+
is_subdocument_match = 'Page2_Start_Page' in group.columns
|
195 |
+
|
196 |
+
if is_subdocument_match:
|
197 |
+
for _, row in group.iterrows():
|
198 |
+
pages_in_range = range(int(row['Page2_Start_Page']), int(row['Page2_End_Page']) + 1)
|
199 |
+
all_pages_to_redact.update(pages_in_range)
|
200 |
+
else:
|
201 |
+
pages = group['Page2_Page'].unique()
|
202 |
+
all_pages_to_redact.update(pages)
|
203 |
+
|
204 |
+
if all_pages_to_redact:
|
205 |
+
redaction_df = pd.DataFrame(sorted(list(all_pages_to_redact)), columns=['Page_to_Redact'])
|
206 |
+
redaction_df.to_csv(output_file_path, header=False, index=False)
|
207 |
|
208 |
+
output_paths.append(str(output_file_path))
|
209 |
+
print(f"Redaction list for {redact_file} saved to {output_file_path}")
|
210 |
|
211 |
return output_paths
|
212 |
|
|
|
215 |
similarity_threshold: float = 0.9,
|
216 |
min_word_count: int = 10,
|
217 |
min_consecutive_pages: int = 1,
|
218 |
+
greedy_match: bool = False,
|
219 |
+
combine_pages:bool=True,
|
220 |
output_folder: str = OUTPUT_FOLDER,
|
221 |
progress=Progress(track_tqdm=True)
|
222 |
) -> Tuple[pd.DataFrame, List[str], pd.DataFrame]:
|
|
|
351 |
|
352 |
progress(0.8, desc="Saving output files")
|
353 |
|
354 |
+
output_paths = save_results_and_redaction_lists(final_df, output_folder, combine_pages)
|
355 |
|
356 |
return final_df, output_paths, df_combined
|
357 |
|
|
|
405 |
# Return the updated dataframe, the new file list, and clear the preview panes
|
406 |
return updated_df, new_output_paths, None, None
|
407 |
|
408 |
+
def run_duplicate_analysis(files:list[pd.DataFrame], threshold:float, min_words:int, min_consecutive:int, greedy_match:bool, duplicates_by_line_or_page_bool:bool=True, preview_length:int=500, progress=gr.Progress(track_tqdm=True)):
|
409 |
"""
|
410 |
Wrapper function updated to include the 'greedy_match' boolean.
|
411 |
"""
|
|
|
414 |
return None, None, None
|
415 |
|
416 |
progress(0, desc="Combining input files...")
|
417 |
+
df_combined, _ = combine_ocr_output_text(files, combine_pages=duplicates_by_line_or_page_bool)
|
418 |
|
419 |
if df_combined.empty:
|
420 |
gr.Warning("No data found in the uploaded files.")
|
|
|
427 |
min_word_count=min_words,
|
428 |
min_consecutive_pages=int(min_consecutive),
|
429 |
greedy_match=greedy_match,
|
430 |
+
combine_pages=duplicates_by_line_or_page_bool,
|
431 |
progress=progress
|
432 |
)
|
433 |
|