seanpedrickcase committed on
Commit
0042e78
·
1 Parent(s): 93b4c8a

Improved logging format a little. Now possible to save logs to DynamoDB

Browse files
README.md CHANGED
@@ -426,7 +426,7 @@ When you click the 'convert .xfdf comment file to review_file.csv' button, the a
426
 
427
  ## Using the AWS Textract document API
428
 
429
- This option can be enabled by your system admin, in the config file ('SHOW_BULK_TEXTRACT_CALL_OPTIONS' environment variable, and subsequent variables). Using this, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here).
430
 
431
  ### Starting a new Textract API job
432
 
 
426
 
427
  ## Using the AWS Textract document API
428
 
429
+ This option can be enabled by your system admin, in the config file ('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS' environment variable, and subsequent variables). Using this, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here).
430
 
431
  ### Starting a new Textract API job
432
 
app.py CHANGED
@@ -4,9 +4,9 @@ import pandas as pd
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
 
7
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH
8
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
9
- from tools.aws_functions import upload_file_to_s3, download_file_from_s3
10
  from tools.file_redaction import choose_and_run_redactor
11
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
12
  from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
@@ -44,6 +44,22 @@ else:
44
  default_ocr_val = text_ocr_option
45
  default_pii_detector = local_pii_detector
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  # Create the gradio interface
48
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
49
 
@@ -149,9 +165,9 @@ with app:
149
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
150
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
151
 
152
- s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
153
- s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
154
- s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
155
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
156
  no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
157
  textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
@@ -253,7 +269,7 @@ with app:
253
  reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
254
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
255
 
256
- if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
257
  with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
258
  with gr.Row(equal_height=True):
259
  gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
@@ -654,7 +670,7 @@ with app:
654
 
655
  # Get connection details on app load
656
 
657
- if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
658
  app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
659
  success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
660
  else:
@@ -681,7 +697,7 @@ with app:
681
  print("Downloading cost codes from S3")
682
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
683
  success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
684
- print("Successfully loaded cost codes from S3")
685
  elif os.path.exists(COST_CODES_PATH):
686
  print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
687
  app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
@@ -691,44 +707,47 @@ with app:
691
  # LOGGING
692
  ###
693
 
 
694
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
695
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
696
- access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
697
-
698
- session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
699
- success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
700
 
 
701
  # User submitted feedback for pdf redactions
702
  pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
703
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
704
- pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
705
- success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
706
 
707
  # User submitted feedback for data redactions
708
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
709
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
710
- data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
711
- success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
 
 
 
712
 
713
- # Log processing time/token usage when making a query
714
- usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
715
 
716
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
717
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
718
 
719
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
720
- success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
721
 
722
- successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
723
- success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
724
  else:
725
  usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
726
 
727
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
728
- success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
729
 
730
- successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
731
- success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
732
 
733
  if __name__ == "__main__":
734
  if RUN_DIRECT_MODE == "0":
 
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
 
7
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS
8
  from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
9
+ from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
10
  from tools.file_redaction import choose_and_run_redactor
11
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
12
  from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
 
44
  default_ocr_val = text_ocr_option
45
  default_pii_detector = local_pii_detector
46
 
47
import ast

# Config values arrive from tools.config as strings (environment variables).
# Parse them with ast.literal_eval rather than eval(): these strings are
# environment-controlled, and eval() would execute arbitrary code. literal_eval
# accepts exactly the expected inputs ("True"/"False" booleans and
# "['col_a', 'col_b']"-style list literals) and raises on anything else.
SAVE_LOGS_TO_CSV = ast.literal_eval(SAVE_LOGS_TO_CSV)
SAVE_LOGS_TO_DYNAMODB = ast.literal_eval(SAVE_LOGS_TO_DYNAMODB)

print("SAVE_LOGS_TO_CSV:", SAVE_LOGS_TO_CSV)
print("SAVE_LOGS_TO_DYNAMODB:", SAVE_LOGS_TO_DYNAMODB)

# Header overrides are optional: only parse when the string is non-empty so an
# unset value stays falsy and the loggers fall back to their defaults.
if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = ast.literal_eval(CSV_ACCESS_LOG_HEADERS)
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = ast.literal_eval(CSV_FEEDBACK_LOG_HEADERS)
if CSV_USAGE_LOG_HEADERS: CSV_USAGE_LOG_HEADERS = ast.literal_eval(CSV_USAGE_LOG_HEADERS)

if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = ast.literal_eval(DYNAMODB_ACCESS_LOG_HEADERS)
if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = ast.literal_eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = ast.literal_eval(DYNAMODB_USAGE_LOG_HEADERS)
63
  # Create the gradio interface
64
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
65
 
 
165
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
166
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
167
 
168
+ s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
169
+ s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
170
+ s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
171
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
172
  no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
173
  textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
 
269
  reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
270
  cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
271
 
272
+ if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
273
  with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
274
  with gr.Row(equal_height=True):
275
  gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
 
670
 
671
  # Get connection details on app load
672
 
673
+ if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
674
  app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
675
  success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
676
  else:
 
697
  print("Downloading cost codes from S3")
698
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
699
  success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
700
+ print("Successfully loaded cost codes from S3")
701
  elif os.path.exists(COST_CODES_PATH):
702
  print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
703
  app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
 
707
  # LOGGING
708
  ###
709
 
710
+ ### ACCESS LOGS
711
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
712
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
713
+ access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
714
+ session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
715
+ success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
716
 
717
+ ### FEEDBACK LOGS
718
  # User submitted feedback for pdf redactions
719
  pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
720
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
721
+ pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
722
+ success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
723
 
724
  # User submitted feedback for data redactions
725
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
726
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
727
+ data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
728
+ success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
729
+
730
+ ### USAGE LOGS
731
+ # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
732
 
733
+ usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
 
734
 
735
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
736
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
737
 
738
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
739
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
740
 
741
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
742
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
743
  else:
744
  usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
745
 
746
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
747
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
748
 
749
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
750
+ success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
751
 
752
  if __name__ == "__main__":
753
  if RUN_DIRECT_MODE == "0":
tools/aws_functions.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
3
  import boto3
4
  import tempfile
5
  import os
6
- from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
9
  def get_assumed_role_info():
@@ -174,3 +174,59 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
174
  final_out_message_str = "App not set to run AWS functions"
175
 
176
  return final_out_message_str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import boto3
4
  import tempfile
5
  import os
6
+ from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SAVE_LOGS_TO_CSV
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
9
  def get_assumed_role_info():
 
174
  final_out_message_str = "App not set to run AWS functions"
175
 
176
  return final_out_message_str
177
+
178
+
179
def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS, SAVE_LOGS_TO_CSV:str=SAVE_LOGS_TO_CSV):
    """
    Uploads log file(s) from the local machine to Amazon S3.

    Args:
    - local_file_paths: Local file path(s) of the file(s) to upload. A single
      string is accepted and treated as a one-element list.
    - s3_key: Key prefix (path) in the S3 bucket. The file's base name is
      appended directly, so the prefix should normally end with '/'.
    - s3_bucket: Name of the S3 bucket.
    - RUN_AWS_FUNCTIONS: "1" enables AWS calls; any other value is a no-op.
    - SAVE_LOGS_TO_CSV: "True" enables uploading the CSV log files.

    Returns:
    - Message as variable/printed to console
    """
    final_out_message = []
    final_out_message_str = ""

    # Only attempt the upload when both AWS functions and CSV logging are on.
    if RUN_AWS_FUNCTIONS == "1" and SAVE_LOGS_TO_CSV == "True":
        try:
            if s3_bucket and s3_key and local_file_paths:

                s3_client = boto3.client('s3', region_name=AWS_REGION)

                # Allow a bare string as well as a list of paths.
                if isinstance(local_file_paths, str):
                    local_file_paths = [local_file_paths]

                for file in local_file_paths:
                    # Per-file try/except so one failed upload does not stop
                    # the remaining files from being attempted.
                    try:
                        # Get file name off file path
                        file_name = os.path.basename(file)

                        s3_key_full = s3_key + file_name
                        print("S3 key: ", s3_key_full)

                        s3_client.upload_file(file, s3_bucket, s3_key_full)
                        out_message = "File " + file_name + " uploaded successfully!"
                        print(out_message)

                    except Exception as e:
                        out_message = f"Error uploading file(s): {e}"
                        print(out_message)

                    final_out_message.append(out_message)

                # Join once after the loop (was previously recomputed per file).
                final_out_message_str = '\n'.join(final_out_message)

            else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
        except Exception as e:
            final_out_message_str = "Could not upload files to S3 due to: " + str(e)
            print(final_out_message_str)
    else:
        final_out_message_str = "App not set to run AWS functions"

    return final_out_message_str
tools/config.py CHANGED
@@ -108,13 +108,15 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
108
 
109
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
110
 
111
- SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
112
 
113
- TEXTRACT_BULK_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_BUCKET', '')
114
 
115
- TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER', 'input')
116
 
117
- TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
 
 
118
 
119
  LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
120
 
@@ -161,6 +163,8 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
161
  # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
162
  # Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
163
 
 
 
164
  USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
165
 
166
  if USE_LOG_SUBFOLDERS == "True":
@@ -181,8 +185,29 @@ ensure_folder_exists(USAGE_LOGS_FOLDER)
181
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
182
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  ###
185
- # REDACTION CONFIG
186
 
187
  # Create Tesseract and Poppler folders if you have installed them locally
188
  TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
@@ -226,7 +251,7 @@ ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
226
 
227
  DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
228
 
229
- GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
230
 
231
  ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
232
 
 
108
 
109
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
110
 
111
+ ### WHOLE DOCUMENT API OPTIONS
112
 
113
+ SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature is not currently implemented
114
 
115
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET', '')
116
 
117
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER', 'input')
118
+
119
+ TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
120
 
121
  LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
122
 
 
163
  # By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
164
  # Another way to address this issue is to write logs to another type of storage, e.g. a database such as DynamoDB; this is now supported via the SAVE_LOGS_TO_DYNAMODB option below.
165
 
166
+ SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
167
+
168
  USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
169
 
170
  if USE_LOG_SUBFOLDERS == "True":
 
185
  # Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
186
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
187
 
188
+ # Further customisation options for CSV logs
189
+
190
+ CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
191
+ CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
192
+ CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
193
+
194
+
195
+ ### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
196
+
197
+ SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
198
+
199
+ ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
200
+ DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var('DYNAMODB_ACCESS_LOG_HEADERS', '')
201
+
202
+ FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', 'redaction_feedback')
203
+ DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var('DYNAMODB_FEEDBACK_LOG_HEADERS', '')
204
+
205
+ USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', 'redaction_usage')
206
+ DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var('DYNAMODB_USAGE_LOG_HEADERS', '')
207
+
208
+ ###
209
+ # REDACTION
210
  ###
 
211
 
212
  # Create Tesseract and Poppler folders if you have installed them locally
213
  TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
 
251
 
252
  DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
253
 
254
+ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
255
 
256
  ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
257
 
tools/custom_csvlogger.py CHANGED
@@ -4,6 +4,10 @@ import csv
4
  import datetime
5
  import os
6
  import re
 
 
 
 
7
  from collections.abc import Sequence
8
  from multiprocessing import Lock
9
  from pathlib import Path
@@ -62,21 +66,28 @@ class CSVLogger_custom(FlaggingCallback):
62
  self.flagging_dir = Path(flagging_dir)
63
  self.first_time = True
64
 
65
- def _create_dataset_file(self, additional_headers: list[str] | None = None):
 
 
 
 
66
  os.makedirs(self.flagging_dir, exist_ok=True)
67
 
68
- if additional_headers is None:
69
- additional_headers = []
70
- headers = (
71
- [
 
 
 
 
 
 
 
72
  getattr(component, "label", None) or f"component {idx}"
73
  for idx, component in enumerate(self.components)
74
- ]
75
- + additional_headers
76
- + [
77
- "timestamp",
78
- ]
79
- )
80
  headers = utils.sanitize_list_for_csv(headers)
81
  dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
82
 
@@ -115,18 +126,24 @@ class CSVLogger_custom(FlaggingCallback):
115
  print("Using existing dataset file at:", self.dataset_filepath)
116
 
117
  def flag(
118
- self,
119
- flag_data: list[Any],
120
- flag_option: str | None = None,
121
- username: str | None = None,
122
- ) -> int:
 
 
 
 
 
123
  if self.first_time:
124
  additional_headers = []
125
  if flag_option is not None:
126
  additional_headers.append("flag")
127
  if username is not None:
128
  additional_headers.append("username")
129
- self._create_dataset_file(additional_headers=additional_headers)
 
130
  self.first_time = False
131
 
132
  csv_data = []
@@ -155,15 +172,113 @@ class CSVLogger_custom(FlaggingCallback):
155
  csv_data.append(flag_option)
156
  if username is not None:
157
  csv_data.append(username)
158
- csv_data.append(str(datetime.datetime.now()))
159
 
160
- with self.lock:
161
- with open(
162
- self.dataset_filepath, "a", newline="", encoding="utf-8"
163
- ) as csvfile:
164
- writer = csv.writer(csvfile)
165
- writer.writerow(utils.sanitize_list_for_csv(csv_data))
166
- with open(self.dataset_filepath, encoding="utf-8") as csvfile:
167
- line_count = len(list(csv.reader(csvfile))) - 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
 
169
  return line_count
 
4
  import datetime
5
  import os
6
  import re
7
+ import boto3
8
+ import botocore
9
+ import uuid
10
+ import time
11
  from collections.abc import Sequence
12
  from multiprocessing import Lock
13
  from pathlib import Path
 
66
  self.flagging_dir = Path(flagging_dir)
67
  self.first_time = True
68
 
69
+ def _create_dataset_file(
70
+ self,
71
+ additional_headers: list[str] | None = None,
72
+ replacement_headers: list[str] | None = None
73
+ ):
74
  os.makedirs(self.flagging_dir, exist_ok=True)
75
 
76
+ if replacement_headers:
77
+ if len(replacement_headers) != len(self.components):
78
+ raise ValueError(
79
+ f"replacement_headers must have the same length as components "
80
+ f"({len(replacement_headers)} provided, {len(self.components)} expected)"
81
+ )
82
+ headers = replacement_headers + ["timestamp"]
83
+ else:
84
+ if additional_headers is None:
85
+ additional_headers = []
86
+ headers = [
87
  getattr(component, "label", None) or f"component {idx}"
88
  for idx, component in enumerate(self.components)
89
+ ] + additional_headers + ["timestamp"]
90
+
 
 
 
 
91
  headers = utils.sanitize_list_for_csv(headers)
92
  dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
93
 
 
126
  print("Using existing dataset file at:", self.dataset_filepath)
127
 
128
  def flag(
129
+ self,
130
+ flag_data: list[Any],
131
+ flag_option: str | None = None,
132
+ username: str | None = None,
133
+ save_to_csv: bool = True,
134
+ save_to_dynamodb: bool = False,
135
+ dynamodb_table_name: str | None = None,
136
+ dynamodb_headers: list[str] | None = None, # New: specify headers for DynamoDB
137
+ replacement_headers: list[str] | None = None
138
+ ) -> int:
139
  if self.first_time:
140
  additional_headers = []
141
  if flag_option is not None:
142
  additional_headers.append("flag")
143
  if username is not None:
144
  additional_headers.append("username")
145
+ additional_headers.append("id")
146
+ self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
147
  self.first_time = False
148
 
149
  csv_data = []
 
172
  csv_data.append(flag_option)
173
  if username is not None:
174
  csv_data.append(username)
 
175
 
176
+
177
+ timestamp = str(datetime.datetime.now())
178
+ csv_data.append(timestamp)
179
+
180
+ generated_id = str(uuid.uuid4())
181
+ csv_data.append(generated_id)
182
+
183
+ # Build the headers
184
+ headers = (
185
+ [getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
186
+ )
187
+ if flag_option is not None:
188
+ headers.append("flag")
189
+ if username is not None:
190
+ headers.append("username")
191
+ headers.append("timestamp")
192
+ headers.append("id")
193
+
194
+ line_count = -1
195
+
196
+ if save_to_csv:
197
+ with self.lock:
198
+ with open(self.dataset_filepath, "a", newline="", encoding="utf-8") as csvfile:
199
+ writer = csv.writer(csvfile)
200
+ writer.writerow(utils.sanitize_list_for_csv(csv_data))
201
+ with open(self.dataset_filepath, encoding="utf-8") as csvfile:
202
+ line_count = len(list(csv.reader(csvfile))) - 1
203
+
204
+ if save_to_dynamodb == True:
205
+ if dynamodb_table_name is None:
206
+ raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
207
+
208
+ dynamodb = boto3.resource('dynamodb')
209
+ client = boto3.client('dynamodb')
210
+
211
+
212
+ if dynamodb_headers:
213
+ dynamodb_headers = dynamodb_headers
214
+ if not dynamodb_headers and replacement_headers:
215
+ dynamodb_headers = replacement_headers
216
+ elif headers:
217
+ dynamodb_headers = headers
218
+ elif not dynamodb_headers:
219
+ raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
220
+
221
+ if flag_option is not None:
222
+ if "flag" not in dynamodb_headers:
223
+ dynamodb_headers.append("flag")
224
+ if username is not None:
225
+ if "username" not in dynamodb_headers:
226
+ dynamodb_headers.append("username")
227
+ if "timestamp" not in dynamodb_headers:
228
+ dynamodb_headers.append("timestamp")
229
+ if "id" not in dynamodb_headers:
230
+ dynamodb_headers.append("id")
231
+
232
+ # Check whether the table already exists; create it if not
233
+ try:
234
+ table = dynamodb.Table(dynamodb_table_name)
235
+ table.load()
236
+ except botocore.exceptions.ClientError as e:
237
+ if e.response['Error']['Code'] == 'ResourceNotFoundException':
238
+
239
+ #print(f"Creating DynamoDB table '{dynamodb_table_name}'...")
240
+ #print("dynamodb_headers:", dynamodb_headers)
241
+
242
+ attribute_definitions = [
243
+ {'AttributeName': 'id', 'AttributeType': 'S'} # Only define key attributes here
244
+ ]
245
+
246
+ table = dynamodb.create_table(
247
+ TableName=dynamodb_table_name,
248
+ KeySchema=[
249
+ {'AttributeName': 'id', 'KeyType': 'HASH'} # Partition key
250
+ ],
251
+ AttributeDefinitions=attribute_definitions,
252
+ BillingMode='PAY_PER_REQUEST'
253
+ )
254
+ # Wait until the table exists
255
+ table.meta.client.get_waiter('table_exists').wait(TableName=dynamodb_table_name)
256
+ time.sleep(5)
257
+ print(f"Table '{dynamodb_table_name}' created successfully.")
258
+ else:
259
+ raise
260
+
261
+ # Prepare the DynamoDB item to upload
262
+
263
+ try:
264
+ item = {
265
+ 'id': str(generated_id), # UUID primary key
266
+ #'created_by': username if username else "unknown",
267
+ 'timestamp': timestamp,
268
+ }
269
+
270
+ #print("dynamodb_headers:", dynamodb_headers)
271
+ #print("csv_data:", csv_data)
272
+
273
+ # Map the headers to values
274
+ item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
275
+
276
+ #print("item:", item)
277
+
278
+ table.put_item(Item=item)
279
+
280
+ print("Successfully uploaded log to DynamoDB")
281
+ except Exception as e:
282
+ print("Could not upload log to DynamobDB due to", e)
283
 
284
  return line_count
tools/custom_image_analyser_engine.py CHANGED
@@ -838,8 +838,7 @@ def combine_ocr_results(ocr_results:dict, x_threshold:float=50.0, y_threshold:fl
838
  height=max(current_bbox.height, result.height)
839
  )
840
  current_line.append(result)
841
- else:
842
-
843
 
844
  # Commit the current line and start a new one
845
  combined_results.append(current_bbox)
 
838
  height=max(current_bbox.height, result.height)
839
  )
840
  current_line.append(result)
841
+ else:
 
842
 
843
  # Commit the current line and start a new one
844
  combined_results.append(current_bbox)
tools/helper_functions.py CHANGED
@@ -9,7 +9,7 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
@@ -306,8 +306,8 @@ async def get_connection_params(request: gr.Request,
306
  output_folder_textbox:str=OUTPUT_FOLDER,
307
  input_folder_textbox:str=INPUT_FOLDER,
308
  session_output_folder:str=SESSION_OUTPUT_FOLDER,
309
- textract_document_upload_input_folder:str=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER,
310
- textract_document_upload_output_folder:str=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER,
311
  s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
312
  local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
313
 
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
 
306
  output_folder_textbox:str=OUTPUT_FOLDER,
307
  input_folder_textbox:str=INPUT_FOLDER,
308
  session_output_folder:str=SESSION_OUTPUT_FOLDER,
309
+ textract_document_upload_input_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
310
+ textract_document_upload_output_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
311
  s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
312
  local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
313
 
tools/textract_batch_call.py CHANGED
@@ -10,7 +10,7 @@ from io import StringIO
10
  from urllib.parse import urlparse
11
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
12
 
13
- from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
14
  #from tools.aws_textract import json_to_ocrresult
15
 
16
  def analyse_document_with_textract_api(
@@ -18,7 +18,7 @@ def analyse_document_with_textract_api(
18
  s3_input_prefix: str,
19
  s3_output_prefix: str,
20
  job_df:pd.DataFrame,
21
- s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
22
  local_output_dir: str = OUTPUT_FOLDER,
23
  analyse_signatures:List[str] = [],
24
  successful_job_number:int=0,
@@ -328,7 +328,7 @@ def poll_bulk_textract_analysis_progress_and_download(
328
  s3_output_prefix: str,
329
  pdf_filename:str,
330
  job_df:pd.DataFrame,
331
- s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
332
  local_output_dir: str = OUTPUT_FOLDER,
333
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
334
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
 
10
  from urllib.parse import urlparse
11
  from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
12
 
13
+ from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
14
  #from tools.aws_textract import json_to_ocrresult
15
 
16
  def analyse_document_with_textract_api(
 
18
  s3_input_prefix: str,
19
  s3_output_prefix: str,
20
  job_df:pd.DataFrame,
21
+ s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
22
  local_output_dir: str = OUTPUT_FOLDER,
23
  analyse_signatures:List[str] = [],
24
  successful_job_number:int=0,
 
328
  s3_output_prefix: str,
329
  pdf_filename:str,
330
  job_df:pd.DataFrame,
331
+ s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
332
  local_output_dir: str = OUTPUT_FOLDER,
333
  load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
334
  load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,