Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Apr 27

Commit

0042e78

1 Parent(s): 93b4c8a

Improved logging format a little. Now possible to save logs to DynamoDB

Browse files

Files changed (8) hide show

README.md +1 -1
app.py +45 -26
tools/aws_functions.py +57 -1
tools/config.py +31 -6
tools/custom_csvlogger.py +141 -26
tools/custom_image_analyser_engine.py +1 -2
tools/helper_functions.py +3 -3
tools/textract_batch_call.py +3 -3

README.md CHANGED Viewed

@@ -426,7 +426,7 @@ When you click the 'convert .xfdf comment file to review_file.csv' button, the a
 ## Using the AWS Textract document API
-This option can be enabled by your system admin, in the config file ('SHOW_BULK_TEXTRACT_CALL_OPTIONS' environment variable, and subsequent variables). Using this, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here).
 ### Starting a new Textract API job

 ## Using the AWS Textract document API
+This option can be enabled by your system admin, in the config file ('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS' environment variable, and subsequent variables). Using this, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here).
 ### Starting a new Textract API job

app.py CHANGED Viewed

@@ -4,9 +4,9 @@ import pandas as pd
 import gradio as gr
 from gradio_image_annotation import image_annotator
-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH
 from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
-from tools.aws_functions import upload_file_to_s3, download_file_from_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
@@ -44,6 +44,22 @@ else:
     default_ocr_val = text_ocr_option
     default_pii_detector = local_pii_detector
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
@@ -149,9 +165,9 @@ with app:
     s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
-    s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
-    s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
-    s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
     successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
     no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
     textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
@@ -253,7 +269,7 @@ with app:
                             reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
                             cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
-            if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
                 with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
                     with gr.Row(equal_height=True):
                         gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
@@ -654,7 +670,7 @@ with app:
     # Get connection details on app load
-    if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
         app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
         success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
     else:
@@ -681,7 +697,7 @@ with app:
             print("Downloading cost codes from S3")
             app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
             success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
-            print("Successfully loaded cost codes from S3")
         elif os.path.exists(COST_CODES_PATH):
             print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
             app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
@@ -691,44 +707,47 @@ with app:
     # LOGGING
     ###
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
-    access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
-    session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
-    success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     # User submitted feedback for pdf redactions
     pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
-    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
-    success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
     # User submitted feedback for data redactions
     data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
-    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
-    success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
-    # Log processing time/token usage when making a query
-    usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
         usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
-        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
-        success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
-        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
-        success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     else:
         usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
-        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
-        success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
-        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
-        success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 if __name__ == "__main__":
     if RUN_DIRECT_MODE == "0":

 import gradio as gr
 from gradio_image_annotation import image_annotator
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS
 from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
+from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
 from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
     default_ocr_val = text_ocr_option
     default_pii_detector = local_pii_detector
+SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
+SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)
+print("SAVE_LOGS_TO_CSV:", SAVE_LOGS_TO_CSV)
+print("SAVE_LOGS_TO_DYNAMODB:", SAVE_LOGS_TO_DYNAMODB)
+if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = eval(CSV_ACCESS_LOG_HEADERS)
+if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = eval(CSV_FEEDBACK_LOG_HEADERS)
+if CSV_USAGE_LOG_HEADERS: CSV_USAGE_LOG_HEADERS = eval(CSV_USAGE_LOG_HEADERS)
+if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCESS_LOG_HEADERS)
+if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
+if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
+print
 # Create the gradio interface
 app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
     s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
+    s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
+    s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
+    s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
     successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
     no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
     textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
                             reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
                             cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
+            if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
                 with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
                     with gr.Row(equal_height=True):
                         gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
     # Get connection details on app load
+    if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
         app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
         success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
     else:
             print("Downloading cost codes from S3")
             app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
             success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
+            print("Successfully loaded cost codesc from S3")
         elif os.path.exists(COST_CODES_PATH):
             print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
             app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
     # LOGGING
     ###
+    ### ACCESS LOGS
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
+    access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
+    session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
+    success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+    ### FEEDBACK LOGS
     # User submitted feedback for pdf redactions
     pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
+    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
+    success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
     # User submitted feedback for data redactions
     data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
+    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
+    success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
+    ### USAGE LOGS
+    # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
+    usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
         usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
+        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
+        success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
+        success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     else:
         usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
+        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
+        success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
+        success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 if __name__ == "__main__":
     if RUN_DIRECT_MODE == "0":

tools/aws_functions.py CHANGED Viewed

@@ -3,7 +3,7 @@ import pandas as pd
 import boto3
 import tempfile
 import os
-from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
 PandasDataFrame = Type[pd.DataFrame]
 def get_assumed_role_info():
@@ -174,3 +174,59 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
         final_out_message_str = "App not set to run AWS functions"
     return final_out_message_str

 import boto3
 import tempfile
 import os
+from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SAVE_LOGS_TO_CSV
 PandasDataFrame = Type[pd.DataFrame]
 def get_assumed_role_info():
         final_out_message_str = "App not set to run AWS functions"
     return final_out_message_str
+def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS, SAVE_LOGS_TO_CSV:str=SAVE_LOGS_TO_CSV):
+    """
+    Uploads a log file from local machine to Amazon S3.
+    Args:
+    - local_file_path: Local file path(s) of the file(s) to upload.
+    - s3_key: Key (path) to the file in the S3 bucket.
+    - s3_bucket: Name of the S3 bucket.
+    Returns:
+    - Message as variable/printed to console
+    """
+    final_out_message = []
+    final_out_message_str = ""
+    if RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV == "True":
+        try:
+            if s3_bucket and s3_key and local_file_paths:
+                s3_client = boto3.client('s3', region_name=AWS_REGION)
+                if isinstance(local_file_paths, str):
+                    local_file_paths = [local_file_paths]
+                for file in local_file_paths:
+                    if s3_client:
+                        #print(s3_client)
+                        try:
+                            # Get file name off file path
+                            file_name = os.path.basename(file)
+                            s3_key_full = s3_key + file_name
+                            print("S3 key: ", s3_key_full)
+                            s3_client.upload_file(file, s3_bucket, s3_key_full)
+                            out_message = "File " + file_name + " uploaded successfully!"
+                            print(out_message)
+                        except Exception as e:
+                            out_message = f"Error uploading file(s): {e}"
+                            print(out_message)
+                        final_out_message.append(out_message)
+                        final_out_message_str = '\n'.join(final_out_message)
+                    else: final_out_message_str = "Could not connect to AWS."
+            else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
+        except Exception as e:
+            final_out_message_str = "Could not upload files to S3 due to: " + str(e)
+            print(final_out_message_str)
+    else:
+        final_out_message_str = "App not set to run AWS functions"
+    return final_out_message_str

tools/config.py CHANGED Viewed

@@ -108,13 +108,15 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
 DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
-SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
-TEXTRACT_BULK_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_BUCKET', '')
-TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER', 'input')
-TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
 LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
@@ -161,6 +163,8 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
 # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
 # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
 USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
 if USE_LOG_SUBFOLDERS == "True":
@@ -181,8 +185,29 @@ ensure_folder_exists(USAGE_LOGS_FOLDER)
 # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
 DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
 ###
-# REDACTION CONFIG
 # Create Tesseract and Poppler folders if you have installed them locally
 TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
@@ -226,7 +251,7 @@ ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
 DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
-GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
 ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv

 DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
+### WHOLE DOCUMENT API OPTIONS
+SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
+TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET', '')
+TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER', 'input')
+TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
 LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
 # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
 # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
+SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
 USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
 if USE_LOG_SUBFOLDERS == "True":
 # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
 DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
+# Further customisation options for CSV logs
+CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
+CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
+CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox",	"doc_full_file_name_textbox",	"data_full_file_name_textbox",	"actual_time_taken_number",	"total_page_count",	"textract_query_number", "pii_detection_method", "comprehend_query_number",  "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
+### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
+SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
+ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
+DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var('DYNAMODB_ACCESS_LOG_HEADERS', '')
+FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', 'redaction_feedback')
+DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var('DYNAMODB_FEEDBACK_LOG_HEADERS', '')
+USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', 'redaction_usage')
+DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var('DYNAMODB_USAGE_LOG_HEADERS', '')
+###
+# REDACTION
 ###
 # Create Tesseract and Poppler folders if you have installed them locally
 TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
 DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
+GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
 ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv

tools/custom_csvlogger.py CHANGED Viewed

@@ -4,6 +4,10 @@ import csv
 import datetime
 import os
 import re
 from collections.abc import Sequence
 from multiprocessing import Lock
 from pathlib import Path
@@ -62,21 +66,28 @@ class CSVLogger_custom(FlaggingCallback):
         self.flagging_dir = Path(flagging_dir)
         self.first_time = True
-    def _create_dataset_file(self, additional_headers: list[str] | None = None):
         os.makedirs(self.flagging_dir, exist_ok=True)
-        if additional_headers is None:
-            additional_headers = []
-        headers = (
-            [
                 getattr(component, "label", None) or f"component {idx}"
                 for idx, component in enumerate(self.components)
-            ]
-            + additional_headers
-            + [
-                "timestamp",
-            ]
-        )
         headers = utils.sanitize_list_for_csv(headers)
         dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
@@ -115,18 +126,24 @@ class CSVLogger_custom(FlaggingCallback):
             print("Using existing dataset file at:", self.dataset_filepath)
     def flag(
-        self,
-        flag_data: list[Any],
-        flag_option: str | None = None,
-        username: str | None = None,
-    ) -> int:
         if self.first_time:
             additional_headers = []
             if flag_option is not None:
                 additional_headers.append("flag")
             if username is not None:
                 additional_headers.append("username")
-            self._create_dataset_file(additional_headers=additional_headers)
             self.first_time = False
         csv_data = []
@@ -155,15 +172,113 @@ class CSVLogger_custom(FlaggingCallback):
             csv_data.append(flag_option)
         if username is not None:
             csv_data.append(username)
-        csv_data.append(str(datetime.datetime.now()))
-        with self.lock:
-            with open(
-                self.dataset_filepath, "a", newline="", encoding="utf-8"
-            ) as csvfile:
-                writer = csv.writer(csvfile)
-                writer.writerow(utils.sanitize_list_for_csv(csv_data))
-            with open(self.dataset_filepath, encoding="utf-8") as csvfile:
-                line_count = len(list(csv.reader(csvfile))) - 1
         return line_count

 import datetime
 import os
 import re
+import boto3
+import botocore
+import uuid
+import time
 from collections.abc import Sequence
 from multiprocessing import Lock
 from pathlib import Path
         self.flagging_dir = Path(flagging_dir)
         self.first_time = True
+    def _create_dataset_file(
+    self,
+    additional_headers: list[str] | None = None,
+    replacement_headers: list[str] | None = None
+):
         os.makedirs(self.flagging_dir, exist_ok=True)
+        if replacement_headers:
+            if len(replacement_headers) != len(self.components):
+                raise ValueError(
+                    f"replacement_headers must have the same length as components "
+                    f"({len(replacement_headers)} provided, {len(self.components)} expected)"
+                )
+            headers = replacement_headers + ["timestamp"]
+        else:
+            if additional_headers is None:
+                additional_headers = []
+            headers = [
                 getattr(component, "label", None) or f"component {idx}"
                 for idx, component in enumerate(self.components)
+            ] + additional_headers + ["timestamp"]
         headers = utils.sanitize_list_for_csv(headers)
         dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
             print("Using existing dataset file at:", self.dataset_filepath)
     def flag(
+    self,
+    flag_data: list[Any],
+    flag_option: str | None = None,
+    username: str | None = None,
+    save_to_csv: bool = True,
+    save_to_dynamodb: bool = False,
+    dynamodb_table_name: str | None = None,
+    dynamodb_headers: list[str] | None = None,  # New: specify headers for DynamoDB
+    replacement_headers: list[str] | None = None
+) -> int:
         if self.first_time:
             additional_headers = []
             if flag_option is not None:
                 additional_headers.append("flag")
             if username is not None:
                 additional_headers.append("username")
+            additional_headers.append("id")
+            self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
             self.first_time = False
         csv_data = []
             csv_data.append(flag_option)
         if username is not None:
             csv_data.append(username)
+        timestamp = str(datetime.datetime.now())
+        csv_data.append(timestamp)
+        generated_id = str(uuid.uuid4())
+        csv_data.append(generated_id)
+        # Build the headers
+        headers = (
+            [getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
+        )
+        if flag_option is not None:
+            headers.append("flag")
+        if username is not None:
+            headers.append("username")
+        headers.append("timestamp")
+        headers.append("id")
+        line_count = -1
+        if save_to_csv:
+            with self.lock:
+                with open(self.dataset_filepath, "a", newline="", encoding="utf-8") as csvfile:
+                    writer = csv.writer(csvfile)
+                    writer.writerow(utils.sanitize_list_for_csv(csv_data))
+                with open(self.dataset_filepath, encoding="utf-8") as csvfile:
+                    line_count = len(list(csv.reader(csvfile))) - 1
+        if save_to_dynamodb == True:
+            if dynamodb_table_name is None:
+                raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
+            dynamodb = boto3.resource('dynamodb')
+            client = boto3.client('dynamodb')
+            if dynamodb_headers:
+                dynamodb_headers = dynamodb_headers
+            if not dynamodb_headers and replacement_headers:
+                dynamodb_headers = replacement_headers
+            elif headers:
+                dynamodb_headers = headers
+            elif not dynamodb_headers:
+                raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
+            if flag_option is not None:
+                if "flag" not in dynamodb_headers:
+                    dynamodb_headers.append("flag")
+                if username is not None:
+                    if "username" not in dynamodb_headers:
+                        dynamodb_headers.append("username")
+                if "timestamp" not in dynamodb_headers:
+                    dynamodb_headers.append("timestamp")
+                if "id" not in dynamodb_headers:
+                    dynamodb_headers.append("id")
+            # Try to load the table; if it doesn't exist yet, create it
+            try:
+                table = dynamodb.Table(dynamodb_table_name)
+                table.load()
+            except botocore.exceptions.ClientError as e:
+                if e.response['Error']['Code'] == 'ResourceNotFoundException':
+                    #print(f"Creating DynamoDB table '{dynamodb_table_name}'...")
+                    #print("dynamodb_headers:", dynamodb_headers)
+                    attribute_definitions = [
+                        {'AttributeName': 'id', 'AttributeType': 'S'}  # Only define key attributes here
+                    ]
+                    table = dynamodb.create_table(
+                        TableName=dynamodb_table_name,
+                        KeySchema=[
+                            {'AttributeName': 'id', 'KeyType': 'HASH'}  # Partition key
+                        ],
+                        AttributeDefinitions=attribute_definitions,
+                        BillingMode='PAY_PER_REQUEST'
+)
+                    # Wait until the table exists
+                    table.meta.client.get_waiter('table_exists').wait(TableName=dynamodb_table_name)
+                    time.sleep(5)
+                    print(f"Table '{dynamodb_table_name}' created successfully.")
+                else:
+                    raise
+            # Prepare the DynamoDB item to upload
+            try:
+                item = {
+                    'id': str(generated_id),  # UUID primary key
+                    #'created_by': username if username else "unknown",
+                    'timestamp': timestamp,
+                }
+                #print("dynamodb_headers:", dynamodb_headers)
+                #print("csv_data:", csv_data)
+                # Map the headers to values
+                item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
+                #print("item:", item)
+                table.put_item(Item=item)
+                print("Successfully uploaded log to DynamoDB")
+            except Exception as e:
+                print("Could not upload log to DynamobDB due to", e)
         return line_count

tools/custom_image_analyser_engine.py CHANGED Viewed

@@ -838,8 +838,7 @@ def combine_ocr_results(ocr_results:dict, x_threshold:float=50.0, y_threshold:fl
                     height=max(current_bbox.height, result.height)
                 )
                 current_line.append(result)
-            else:
                 # Commit the current line and start a new one
                 combined_results.append(current_bbox)

                     height=max(current_bbox.height, result.height)
                 )
                 current_line.append(result)
+            else:
                 # Commit the current line and start a new one
                 combined_results.append(current_bbox)

tools/helper_functions.py CHANGED Viewed

@@ -9,7 +9,7 @@ import unicodedata
 from typing import List
 from math import ceil
 from gradio_image_annotation import image_annotator
-from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
 # Names for options labels
 text_ocr_option = "Local model - selectable text"
@@ -306,8 +306,8 @@ async def get_connection_params(request: gr.Request,
                                 output_folder_textbox:str=OUTPUT_FOLDER,
                                 input_folder_textbox:str=INPUT_FOLDER,
                                 session_output_folder:str=SESSION_OUTPUT_FOLDER,
-                                textract_document_upload_input_folder:str=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER,
-                                textract_document_upload_output_folder:str=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER,
                                 s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
                                 local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):

 from typing import List
 from math import ceil
 from gradio_image_annotation import image_annotator
+from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
 # Names for options labels
 text_ocr_option = "Local model - selectable text"
                                 output_folder_textbox:str=OUTPUT_FOLDER,
                                 input_folder_textbox:str=INPUT_FOLDER,
                                 session_output_folder:str=SESSION_OUTPUT_FOLDER,
+                                textract_document_upload_input_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
+                                textract_document_upload_output_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
                                 s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
                                 local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):

tools/textract_batch_call.py CHANGED Viewed

@@ -10,7 +10,7 @@ from io import StringIO
 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
-from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
 #from tools.aws_textract import json_to_ocrresult
 def analyse_document_with_textract_api(
@@ -18,7 +18,7 @@ def analyse_document_with_textract_api(
     s3_input_prefix: str,
     s3_output_prefix: str,
     job_df:pd.DataFrame,
-    s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
     local_output_dir: str = OUTPUT_FOLDER,
     analyse_signatures:List[str] = [],
     successful_job_number:int=0,
@@ -328,7 +328,7 @@ def poll_bulk_textract_analysis_progress_and_download(
     s3_output_prefix: str,
     pdf_filename:str,
     job_df:pd.DataFrame,
-    s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
     local_output_dir: str = OUTPUT_FOLDER,
     load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
     load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,

 from urllib.parse import urlparse
 from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
+from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
 #from tools.aws_textract import json_to_ocrresult
 def analyse_document_with_textract_api(
     s3_input_prefix: str,
     s3_output_prefix: str,
     job_df:pd.DataFrame,
+    s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
     local_output_dir: str = OUTPUT_FOLDER,
     analyse_signatures:List[str] = [],
     successful_job_number:int=0,
     s3_output_prefix: str,
     pdf_filename:str,
     job_df:pd.DataFrame,
+    s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
     local_output_dir: str = OUTPUT_FOLDER,
     load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
     load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,