Commit
·
0042e78
1
Parent(s):
93b4c8a
Improved logging format a little. Now possible to save logs to DynamoDB
Browse files- README.md +1 -1
- app.py +45 -26
- tools/aws_functions.py +57 -1
- tools/config.py +31 -6
- tools/custom_csvlogger.py +141 -26
- tools/custom_image_analyser_engine.py +1 -2
- tools/helper_functions.py +3 -3
- tools/textract_batch_call.py +3 -3
README.md
CHANGED
@@ -426,7 +426,7 @@ When you click the 'convert .xfdf comment file to review_file.csv' button, the a
|
|
426 |
|
427 |
## Using the AWS Textract document API
|
428 |
|
429 |
-
This option can be enabled by your system admin, in the config file ('
|
430 |
|
431 |
### Starting a new Textract API job
|
432 |
|
|
|
426 |
|
427 |
## Using the AWS Textract document API
|
428 |
|
429 |
+
This option can be enabled by your system admin in the config file (by setting the 'SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS' environment variable to 'True', together with the related Textract variables that follow it). When it is enabled, you will have the option to submit whole documents in quick succession to the AWS Textract service to get extracted text outputs quickly (faster than using the 'Redact document' process described here).
|
430 |
|
431 |
### Starting a new Textract API job
|
432 |
|
app.py
CHANGED
@@ -4,9 +4,9 @@ import pandas as pd
|
|
4 |
import gradio as gr
|
5 |
from gradio_image_annotation import image_annotator
|
6 |
|
7 |
-
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET,
|
8 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
|
9 |
-
from tools.aws_functions import upload_file_to_s3, download_file_from_s3
|
10 |
from tools.file_redaction import choose_and_run_redactor
|
11 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
12 |
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
|
@@ -44,6 +44,22 @@ else:
|
|
44 |
default_ocr_val = text_ocr_option
|
45 |
default_pii_detector = local_pii_detector
|
46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
# Create the gradio interface
|
48 |
app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
|
49 |
|
@@ -149,9 +165,9 @@ with app:
|
|
149 |
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
|
150 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
|
151 |
|
152 |
-
s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=
|
153 |
-
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=
|
154 |
-
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=
|
155 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
156 |
no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
|
157 |
textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
|
@@ -253,7 +269,7 @@ with app:
|
|
253 |
reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
|
254 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
255 |
|
256 |
-
if
|
257 |
with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
|
258 |
with gr.Row(equal_height=True):
|
259 |
gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
|
@@ -654,7 +670,7 @@ with app:
|
|
654 |
|
655 |
# Get connection details on app load
|
656 |
|
657 |
-
if
|
658 |
app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
|
659 |
success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
|
660 |
else:
|
@@ -681,7 +697,7 @@ with app:
|
|
681 |
print("Downloading cost codes from S3")
|
682 |
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
|
683 |
success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
|
684 |
-
print("Successfully loaded cost
|
685 |
elif os.path.exists(COST_CODES_PATH):
|
686 |
print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
|
687 |
app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
|
@@ -691,44 +707,47 @@ with app:
|
|
691 |
# LOGGING
|
692 |
###
|
693 |
|
|
|
694 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
695 |
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
696 |
-
access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
|
697 |
-
|
698 |
-
|
699 |
-
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
700 |
|
|
|
701 |
# User submitted feedback for pdf redactions
|
702 |
pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
703 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
|
704 |
-
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
|
705 |
-
success(fn =
|
706 |
|
707 |
# User submitted feedback for data redactions
|
708 |
data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
709 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
|
710 |
-
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
|
711 |
-
success(fn =
|
|
|
|
|
|
|
712 |
|
713 |
-
|
714 |
-
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
715 |
|
716 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
717 |
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
718 |
|
719 |
-
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
720 |
-
success(fn =
|
721 |
|
722 |
-
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
723 |
-
success(fn =
|
724 |
else:
|
725 |
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
726 |
|
727 |
-
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
728 |
-
success(fn =
|
729 |
|
730 |
-
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
731 |
-
success(fn =
|
732 |
|
733 |
if __name__ == "__main__":
|
734 |
if RUN_DIRECT_MODE == "0":
|
|
|
4 |
import gradio as gr
|
5 |
from gradio_image_annotation import image_annotator
|
6 |
|
7 |
+
from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS
|
8 |
from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
|
9 |
+
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
|
10 |
from tools.file_redaction import choose_and_run_redactor
|
11 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
|
12 |
from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
|
|
|
44 |
default_ocr_val = text_ocr_option
|
45 |
default_pii_detector = local_pii_detector
|
46 |
|
47 |
+
# Convert the logging options imported from tools.config from their string
# form into Python values (booleans for the two flags, evaluated literals
# for the header settings).
# NOTE(review): eval() executes arbitrary expressions. These values come from
# the app configuration; if they can ever be set by an untrusted party,
# switch to ast.literal_eval, which accepts the same plain literals
# ("True", "['a', 'b']") without executing code.
SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)

print("SAVE_LOGS_TO_CSV:", SAVE_LOGS_TO_CSV)
print("SAVE_LOGS_TO_DYNAMODB:", SAVE_LOGS_TO_DYNAMODB)

# Header overrides are optional: an empty value is left untouched.
if CSV_ACCESS_LOG_HEADERS: CSV_ACCESS_LOG_HEADERS = eval(CSV_ACCESS_LOG_HEADERS)
if CSV_FEEDBACK_LOG_HEADERS: CSV_FEEDBACK_LOG_HEADERS = eval(CSV_FEEDBACK_LOG_HEADERS)
if CSV_USAGE_LOG_HEADERS: CSV_USAGE_LOG_HEADERS = eval(CSV_USAGE_LOG_HEADERS)

if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCESS_LOG_HEADERS)
if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
|
62 |
+
|
63 |
# Create the gradio interface
|
64 |
app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
|
65 |
|
|
|
165 |
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
|
166 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
|
167 |
|
168 |
+
s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, visible=False)
|
169 |
+
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
170 |
+
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
|
171 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
172 |
no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
|
173 |
textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
|
|
|
269 |
reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
|
270 |
cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
|
271 |
|
272 |
+
if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
|
273 |
with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
|
274 |
with gr.Row(equal_height=True):
|
275 |
gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
|
|
|
670 |
|
671 |
# Get connection details on app load
|
672 |
|
673 |
+
if SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS == "True":
|
674 |
app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
|
675 |
success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
|
676 |
else:
|
|
|
697 |
print("Downloading cost codes from S3")
|
698 |
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
|
699 |
success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
|
700 |
+
# Runs when the interface is built, right after the load event above is
# registered - the download itself happens later, when the app loads.
print("Cost codes set to load from S3 on app load")
|
701 |
elif os.path.exists(COST_CODES_PATH):
|
702 |
print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
|
703 |
app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
|
|
|
707 |
# LOGGING
|
708 |
###
|
709 |
|
710 |
+
### ACCESS LOGS
|
711 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
712 |
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
713 |
+
access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
|
714 |
+
session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
|
715 |
+
success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
|
|
716 |
|
717 |
+
### FEEDBACK LOGS
|
718 |
# User submitted feedback for pdf redactions
|
719 |
pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
720 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
|
721 |
+
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
|
722 |
+
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
723 |
|
724 |
# User submitted feedback for data redactions
|
725 |
data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
726 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
|
727 |
+
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
|
728 |
+
success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
729 |
+
|
730 |
+
### USAGE LOGS
|
731 |
+
# Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
|
732 |
|
733 |
+
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
|
|
734 |
|
735 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
736 |
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
737 |
|
738 |
+
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
739 |
+
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
740 |
|
741 |
+
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
742 |
+
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
743 |
else:
|
744 |
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
745 |
|
746 |
+
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
747 |
+
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
748 |
|
749 |
+
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=USAGE_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_USAGE_LOG_HEADERS, replacement_headers=CSV_USAGE_LOG_HEADERS), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
750 |
+
success(fn = upload_log_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
751 |
|
752 |
if __name__ == "__main__":
|
753 |
if RUN_DIRECT_MODE == "0":
|
tools/aws_functions.py
CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
|
|
3 |
import boto3
|
4 |
import tempfile
|
5 |
import os
|
6 |
-
from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
|
9 |
def get_assumed_role_info():
|
@@ -174,3 +174,59 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
|
|
174 |
final_out_message_str = "App not set to run AWS functions"
|
175 |
|
176 |
return final_out_message_str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
import boto3
|
4 |
import tempfile
|
5 |
import os
|
6 |
+
from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SAVE_LOGS_TO_CSV
|
7 |
PandasDataFrame = Type[pd.DataFrame]
|
8 |
|
9 |
def get_assumed_role_info():
|
|
|
174 |
final_out_message_str = "App not set to run AWS functions"
|
175 |
|
176 |
return final_out_message_str
|
177 |
+
|
178 |
+
|
179 |
+
def upload_log_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS, SAVE_LOGS_TO_CSV:str=SAVE_LOGS_TO_CSV):
    """
    Uploads one or more log files from the local machine to Amazon S3.

    Nothing is uploaded unless AWS functions are enabled and logs are being
    saved to CSV.

    Args:
    - local_file_paths: Local file path(s) of the file(s) to upload. A single path string is also accepted.
    - s3_key: Key prefix (folder path) in the S3 bucket. The file name is appended to it directly, with no separator added.
    - s3_bucket: Name of the S3 bucket.
    - RUN_AWS_FUNCTIONS: "1" to allow calls to AWS; any other value skips the upload.
    - SAVE_LOGS_TO_CSV: "True" (or the boolean True) if CSV log files are being written; any other value skips the upload.

    Returns:
    - Message as variable/printed to console: one line per file on upload, otherwise the reason nothing was uploaded.
    """
    # Compare via str() so that both the raw config strings and already-parsed
    # values (e.g. the boolean True) are recognised.
    if str(RUN_AWS_FUNCTIONS) != "1":
        return "App not set to run AWS functions"

    if str(SAVE_LOGS_TO_CSV) != "True":
        return "App not set to save logs to CSV, so no log file was uploaded to S3"

    try:
        if not (s3_bucket and s3_key and local_file_paths):
            return "At least one essential variable is empty, could not upload to S3"

        s3_client = boto3.client('s3', region_name=AWS_REGION)
        if not s3_client:
            return "Could not connect to AWS."

        if isinstance(local_file_paths, str):
            local_file_paths = [local_file_paths]

        out_messages = []

        for file_path in local_file_paths:
            # A failure on one file is reported but does not stop the others
            try:
                # Get file name off file path
                file_name = os.path.basename(file_path)

                s3_key_full = s3_key + file_name
                print("S3 key: ", s3_key_full)

                s3_client.upload_file(file_path, s3_bucket, s3_key_full)
                out_message = "File " + file_name + " uploaded successfully!"

            except Exception as e:
                out_message = f"Error uploading file(s): {e}"

            print(out_message)
            out_messages.append(out_message)

        return '\n'.join(out_messages)

    except Exception as e:
        # Best-effort: log upload problems are reported, never raised to the app
        final_out_message_str = "Could not upload files to S3 due to: " + str(e)
        print(final_out_message_str)
        return final_out_message_str
|
tools/config.py
CHANGED
@@ -108,13 +108,15 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
|
|
108 |
|
109 |
DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
|
110 |
|
111 |
-
|
112 |
|
113 |
-
|
114 |
|
115 |
-
|
116 |
|
117 |
-
|
|
|
|
|
118 |
|
119 |
LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
|
120 |
|
@@ -161,6 +163,8 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
|
|
161 |
# By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
|
162 |
# Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
|
163 |
|
|
|
|
|
164 |
USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
|
165 |
|
166 |
if USE_LOG_SUBFOLDERS == "True":
|
@@ -181,8 +185,29 @@ ensure_folder_exists(USAGE_LOGS_FOLDER)
|
|
181 |
# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
|
182 |
DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
|
183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
###
|
185 |
-
# REDACTION CONFIG
|
186 |
|
187 |
# Create Tesseract and Poppler folders if you have installed them locally
|
188 |
TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
|
@@ -226,7 +251,7 @@ ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
|
|
226 |
|
227 |
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
|
228 |
|
229 |
-
GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '
|
230 |
|
231 |
ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
|
232 |
|
|
|
108 |
|
109 |
DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
|
110 |
|
111 |
+
### WHOLE DOCUMENT API OPTIONS
|
112 |
|
113 |
+
SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
|
114 |
|
115 |
+
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET', '')
|
116 |
|
117 |
+
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER', 'input')
|
118 |
+
|
119 |
+
TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
|
120 |
|
121 |
LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
|
122 |
|
|
|
163 |
# By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
|
164 |
# Another way to address this issue would be to write logs to another type of storage, e.g. a database such as DynamoDB. I may look into this in future.
|
165 |
|
166 |
+
SAVE_LOGS_TO_CSV = get_or_create_env_var('SAVE_LOGS_TO_CSV', 'True')
|
167 |
+
|
168 |
USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
|
169 |
|
170 |
if USE_LOG_SUBFOLDERS == "True":
|
|
|
185 |
# Should the redacted file name be included in the logs? In some instances, the names of the files themselves could be sensitive, and should not be disclosed beyond the app. So, by default this is false.
|
186 |
DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
|
187 |
|
188 |
+
# Further customisation options for CSV logs
|
189 |
+
|
190 |
+
CSV_ACCESS_LOG_HEADERS = get_or_create_env_var('CSV_ACCESS_LOG_HEADERS', '') # If blank, uses component labels
|
191 |
+
CSV_FEEDBACK_LOG_HEADERS = get_or_create_env_var('CSV_FEEDBACK_LOG_HEADERS', '') # If blank, uses component labels
|
192 |
+
CSV_USAGE_LOG_HEADERS = get_or_create_env_var('CSV_USAGE_LOG_HEADERS', '["session_hash_textbox", "doc_full_file_name_textbox", "data_full_file_name_textbox", "actual_time_taken_number", "total_page_count", "textract_query_number", "pii_detection_method", "comprehend_query_number", "cost_code", "textract_handwriting_signature", "host_name_textbox", "text_extraction_method", "is_this_a_textract_api_call"]') # If blank, uses component labels
|
193 |
+
|
194 |
+
|
195 |
+
### DYNAMODB logs. Whether to save to DynamoDB, and the headers of the table
|
196 |
+
|
197 |
+
SAVE_LOGS_TO_DYNAMODB = get_or_create_env_var('SAVE_LOGS_TO_DYNAMODB', 'False')
|
198 |
+
|
199 |
+
ACCESS_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('ACCESS_LOG_DYNAMODB_TABLE_NAME', 'redaction_access_log')
|
200 |
+
DYNAMODB_ACCESS_LOG_HEADERS = get_or_create_env_var('DYNAMODB_ACCESS_LOG_HEADERS', '')
|
201 |
+
|
202 |
+
FEEDBACK_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('FEEDBACK_LOG_DYNAMODB_TABLE_NAME', 'redaction_feedback')
|
203 |
+
DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var('DYNAMODB_FEEDBACK_LOG_HEADERS', '')
|
204 |
+
|
205 |
+
USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', 'redaction_usage')
|
206 |
+
DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var('DYNAMODB_USAGE_LOG_HEADERS', '')
|
207 |
+
|
208 |
+
###
|
209 |
+
# REDACTION
|
210 |
###
|
|
|
211 |
|
212 |
# Create Tesseract and Poppler folders if you have installed them locally
|
213 |
TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
|
|
|
251 |
|
252 |
DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '3')
|
253 |
|
254 |
+
GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', '')
|
255 |
|
256 |
ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_allow_list.csv
|
257 |
|
tools/custom_csvlogger.py
CHANGED
@@ -4,6 +4,10 @@ import csv
|
|
4 |
import datetime
|
5 |
import os
|
6 |
import re
|
|
|
|
|
|
|
|
|
7 |
from collections.abc import Sequence
|
8 |
from multiprocessing import Lock
|
9 |
from pathlib import Path
|
@@ -62,21 +66,28 @@ class CSVLogger_custom(FlaggingCallback):
|
|
62 |
self.flagging_dir = Path(flagging_dir)
|
63 |
self.first_time = True
|
64 |
|
65 |
-
def _create_dataset_file(
|
|
|
|
|
|
|
|
|
66 |
os.makedirs(self.flagging_dir, exist_ok=True)
|
67 |
|
68 |
-
if
|
69 |
-
|
70 |
-
|
71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
getattr(component, "label", None) or f"component {idx}"
|
73 |
for idx, component in enumerate(self.components)
|
74 |
-
]
|
75 |
-
|
76 |
-
+ [
|
77 |
-
"timestamp",
|
78 |
-
]
|
79 |
-
)
|
80 |
headers = utils.sanitize_list_for_csv(headers)
|
81 |
dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
|
82 |
|
@@ -115,18 +126,24 @@ class CSVLogger_custom(FlaggingCallback):
|
|
115 |
print("Using existing dataset file at:", self.dataset_filepath)
|
116 |
|
117 |
def flag(
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
123 |
if self.first_time:
|
124 |
additional_headers = []
|
125 |
if flag_option is not None:
|
126 |
additional_headers.append("flag")
|
127 |
if username is not None:
|
128 |
additional_headers.append("username")
|
129 |
-
|
|
|
130 |
self.first_time = False
|
131 |
|
132 |
csv_data = []
|
@@ -155,15 +172,113 @@ class CSVLogger_custom(FlaggingCallback):
|
|
155 |
csv_data.append(flag_option)
|
156 |
if username is not None:
|
157 |
csv_data.append(username)
|
158 |
-
csv_data.append(str(datetime.datetime.now()))
|
159 |
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
168 |
|
169 |
return line_count
|
|
|
4 |
import datetime
|
5 |
import os
|
6 |
import re
|
7 |
+
import boto3
|
8 |
+
import botocore
|
9 |
+
import uuid
|
10 |
+
import time
|
11 |
from collections.abc import Sequence
|
12 |
from multiprocessing import Lock
|
13 |
from pathlib import Path
|
|
|
66 |
self.flagging_dir = Path(flagging_dir)
|
67 |
self.first_time = True
|
68 |
|
69 |
+
def _create_dataset_file(
|
70 |
+
self,
|
71 |
+
additional_headers: list[str] | None = None,
|
72 |
+
replacement_headers: list[str] | None = None
|
73 |
+
):
|
74 |
os.makedirs(self.flagging_dir, exist_ok=True)
|
75 |
|
76 |
+
if replacement_headers:
|
77 |
+
if len(replacement_headers) != len(self.components):
|
78 |
+
raise ValueError(
|
79 |
+
f"replacement_headers must have the same length as components "
|
80 |
+
f"({len(replacement_headers)} provided, {len(self.components)} expected)"
|
81 |
+
)
|
82 |
+
headers = replacement_headers + ["timestamp"]
|
83 |
+
else:
|
84 |
+
if additional_headers is None:
|
85 |
+
additional_headers = []
|
86 |
+
headers = [
|
87 |
getattr(component, "label", None) or f"component {idx}"
|
88 |
for idx, component in enumerate(self.components)
|
89 |
+
] + additional_headers + ["timestamp"]
|
90 |
+
|
|
|
|
|
|
|
|
|
91 |
headers = utils.sanitize_list_for_csv(headers)
|
92 |
dataset_files = list(Path(self.flagging_dir).glob("dataset*.csv"))
|
93 |
|
|
|
126 |
print("Using existing dataset file at:", self.dataset_filepath)
|
127 |
|
128 |
def flag(
|
129 |
+
self,
|
130 |
+
flag_data: list[Any],
|
131 |
+
flag_option: str | None = None,
|
132 |
+
username: str | None = None,
|
133 |
+
save_to_csv: bool = True,
|
134 |
+
save_to_dynamodb: bool = False,
|
135 |
+
dynamodb_table_name: str | None = None,
|
136 |
+
dynamodb_headers: list[str] | None = None, # New: specify headers for DynamoDB
|
137 |
+
replacement_headers: list[str] | None = None
|
138 |
+
) -> int:
|
139 |
if self.first_time:
|
140 |
additional_headers = []
|
141 |
if flag_option is not None:
|
142 |
additional_headers.append("flag")
|
143 |
if username is not None:
|
144 |
additional_headers.append("username")
|
145 |
+
additional_headers.append("id")
|
146 |
+
self._create_dataset_file(additional_headers=additional_headers, replacement_headers=replacement_headers)
|
147 |
self.first_time = False
|
148 |
|
149 |
csv_data = []
|
|
|
172 |
csv_data.append(flag_option)
|
173 |
if username is not None:
|
174 |
csv_data.append(username)
|
|
|
175 |
|
176 |
+
|
177 |
+
timestamp = str(datetime.datetime.now())
|
178 |
+
csv_data.append(timestamp)
|
179 |
+
|
180 |
+
generated_id = str(uuid.uuid4())
|
181 |
+
csv_data.append(generated_id)
|
182 |
+
|
183 |
+
# Build the headers
|
184 |
+
headers = (
|
185 |
+
[getattr(component, "label", None) or f"component {idx}" for idx, component in enumerate(self.components)]
|
186 |
+
)
|
187 |
+
if flag_option is not None:
|
188 |
+
headers.append("flag")
|
189 |
+
if username is not None:
|
190 |
+
headers.append("username")
|
191 |
+
headers.append("timestamp")
|
192 |
+
headers.append("id")
|
193 |
+
|
194 |
+
line_count = -1
|
195 |
+
|
196 |
+
if save_to_csv:
|
197 |
+
with self.lock:
|
198 |
+
with open(self.dataset_filepath, "a", newline="", encoding="utf-8") as csvfile:
|
199 |
+
writer = csv.writer(csvfile)
|
200 |
+
writer.writerow(utils.sanitize_list_for_csv(csv_data))
|
201 |
+
with open(self.dataset_filepath, encoding="utf-8") as csvfile:
|
202 |
+
line_count = len(list(csv.reader(csvfile))) - 1
|
203 |
+
|
204 |
+
if save_to_dynamodb == True:
|
205 |
+
if dynamodb_table_name is None:
|
206 |
+
raise ValueError("You must provide a dynamodb_table_name if save_to_dynamodb is True")
|
207 |
+
|
208 |
+
dynamodb = boto3.resource('dynamodb')
|
209 |
+
client = boto3.client('dynamodb')
|
210 |
+
|
211 |
+
|
212 |
+
if dynamodb_headers:
|
213 |
+
dynamodb_headers = dynamodb_headers
|
214 |
+
if not dynamodb_headers and replacement_headers:
|
215 |
+
dynamodb_headers = replacement_headers
|
216 |
+
elif headers:
|
217 |
+
dynamodb_headers = headers
|
218 |
+
elif not dynamodb_headers:
|
219 |
+
raise ValueError("Headers not found. You must provide dynamodb_headers or replacement_headers to create a new table.")
|
220 |
+
|
221 |
+
if flag_option is not None:
|
222 |
+
if "flag" not in dynamodb_headers:
|
223 |
+
dynamodb_headers.append("flag")
|
224 |
+
if username is not None:
|
225 |
+
if "username" not in dynamodb_headers:
|
226 |
+
dynamodb_headers.append("username")
|
227 |
+
if "timestamp" not in dynamodb_headers:
|
228 |
+
dynamodb_headers.append("timestamp")
|
229 |
+
if "id" not in dynamodb_headers:
|
230 |
+
dynamodb_headers.append("id")
|
231 |
+
|
232 |
+
# Check whether the table exists; if it does not, create it
|
233 |
+
try:
|
234 |
+
table = dynamodb.Table(dynamodb_table_name)
|
235 |
+
table.load()
|
236 |
+
except botocore.exceptions.ClientError as e:
|
237 |
+
if e.response['Error']['Code'] == 'ResourceNotFoundException':
|
238 |
+
|
239 |
+
#print(f"Creating DynamoDB table '{dynamodb_table_name}'...")
|
240 |
+
#print("dynamodb_headers:", dynamodb_headers)
|
241 |
+
|
242 |
+
attribute_definitions = [
|
243 |
+
{'AttributeName': 'id', 'AttributeType': 'S'} # Only define key attributes here
|
244 |
+
]
|
245 |
+
|
246 |
+
table = dynamodb.create_table(
|
247 |
+
TableName=dynamodb_table_name,
|
248 |
+
KeySchema=[
|
249 |
+
{'AttributeName': 'id', 'KeyType': 'HASH'} # Partition key
|
250 |
+
],
|
251 |
+
AttributeDefinitions=attribute_definitions,
|
252 |
+
BillingMode='PAY_PER_REQUEST'
|
253 |
+
)
|
254 |
+
# Wait until the table exists
|
255 |
+
table.meta.client.get_waiter('table_exists').wait(TableName=dynamodb_table_name)
|
256 |
+
time.sleep(5)
|
257 |
+
print(f"Table '{dynamodb_table_name}' created successfully.")
|
258 |
+
else:
|
259 |
+
raise
|
260 |
+
|
261 |
+
# Prepare the DynamoDB item to upload
|
262 |
+
|
263 |
+
try:
|
264 |
+
item = {
|
265 |
+
'id': str(generated_id), # UUID primary key
|
266 |
+
#'created_by': username if username else "unknown",
|
267 |
+
'timestamp': timestamp,
|
268 |
+
}
|
269 |
+
|
270 |
+
#print("dynamodb_headers:", dynamodb_headers)
|
271 |
+
#print("csv_data:", csv_data)
|
272 |
+
|
273 |
+
# Map the headers to values
|
274 |
+
item.update({header: str(value) for header, value in zip(dynamodb_headers, csv_data)})
|
275 |
+
|
276 |
+
#print("item:", item)
|
277 |
+
|
278 |
+
table.put_item(Item=item)
|
279 |
+
|
280 |
+
print("Successfully uploaded log to DynamoDB")
|
281 |
+
except Exception as e:
|
282 |
+
print("Could not upload log to DynamobDB due to", e)
|
283 |
|
284 |
return line_count
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -838,8 +838,7 @@ def combine_ocr_results(ocr_results:dict, x_threshold:float=50.0, y_threshold:fl
|
|
838 |
height=max(current_bbox.height, result.height)
|
839 |
)
|
840 |
current_line.append(result)
|
841 |
-
else:
|
842 |
-
|
843 |
|
844 |
# Commit the current line and start a new one
|
845 |
combined_results.append(current_bbox)
|
|
|
838 |
height=max(current_bbox.height, result.height)
|
839 |
)
|
840 |
current_line.append(result)
|
841 |
+
else:
|
|
|
842 |
|
843 |
# Commit the current line and start a new one
|
844 |
combined_results.append(current_bbox)
|
tools/helper_functions.py
CHANGED
@@ -9,7 +9,7 @@ import unicodedata
|
|
9 |
from typing import List
|
10 |
from math import ceil
|
11 |
from gradio_image_annotation import image_annotator
|
12 |
-
from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID,
|
13 |
|
14 |
# Names for options labels
|
15 |
text_ocr_option = "Local model - selectable text"
|
@@ -306,8 +306,8 @@ async def get_connection_params(request: gr.Request,
|
|
306 |
output_folder_textbox:str=OUTPUT_FOLDER,
|
307 |
input_folder_textbox:str=INPUT_FOLDER,
|
308 |
session_output_folder:str=SESSION_OUTPUT_FOLDER,
|
309 |
-
textract_document_upload_input_folder:str=
|
310 |
-
textract_document_upload_output_folder:str=
|
311 |
s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
|
312 |
local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
|
313 |
|
|
|
9 |
from typing import List
|
10 |
from math import ceil
|
11 |
from gradio_image_annotation import image_annotator
|
12 |
+
from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
|
13 |
|
14 |
# Names for options labels
|
15 |
text_ocr_option = "Local model - selectable text"
|
|
|
306 |
output_folder_textbox:str=OUTPUT_FOLDER,
|
307 |
input_folder_textbox:str=INPUT_FOLDER,
|
308 |
session_output_folder:str=SESSION_OUTPUT_FOLDER,
|
309 |
+
textract_document_upload_input_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER,
|
310 |
+
textract_document_upload_output_folder:str=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER,
|
311 |
s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
|
312 |
local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
|
313 |
|
tools/textract_batch_call.py
CHANGED
@@ -10,7 +10,7 @@ from io import StringIO
|
|
10 |
from urllib.parse import urlparse
|
11 |
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
|
12 |
|
13 |
-
from tools.config import
|
14 |
#from tools.aws_textract import json_to_ocrresult
|
15 |
|
16 |
def analyse_document_with_textract_api(
|
@@ -18,7 +18,7 @@ def analyse_document_with_textract_api(
|
|
18 |
s3_input_prefix: str,
|
19 |
s3_output_prefix: str,
|
20 |
job_df:pd.DataFrame,
|
21 |
-
s3_bucket_name: str =
|
22 |
local_output_dir: str = OUTPUT_FOLDER,
|
23 |
analyse_signatures:List[str] = [],
|
24 |
successful_job_number:int=0,
|
@@ -328,7 +328,7 @@ def poll_bulk_textract_analysis_progress_and_download(
|
|
328 |
s3_output_prefix: str,
|
329 |
pdf_filename:str,
|
330 |
job_df:pd.DataFrame,
|
331 |
-
s3_bucket_name: str =
|
332 |
local_output_dir: str = OUTPUT_FOLDER,
|
333 |
load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
|
334 |
load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
|
|
|
10 |
from urllib.parse import urlparse
|
11 |
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
|
12 |
|
13 |
+
from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
|
14 |
#from tools.aws_textract import json_to_ocrresult
|
15 |
|
16 |
def analyse_document_with_textract_api(
|
|
|
18 |
s3_input_prefix: str,
|
19 |
s3_output_prefix: str,
|
20 |
job_df:pd.DataFrame,
|
21 |
+
s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
|
22 |
local_output_dir: str = OUTPUT_FOLDER,
|
23 |
analyse_signatures:List[str] = [],
|
24 |
successful_job_number:int=0,
|
|
|
328 |
s3_output_prefix: str,
|
329 |
pdf_filename:str,
|
330 |
job_df:pd.DataFrame,
|
331 |
+
s3_bucket_name: str = TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET,
|
332 |
local_output_dir: str = OUTPUT_FOLDER,
|
333 |
load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
|
334 |
load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
|