Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Apr 23

Commit

8953ca0

1 Parent(s): a56b9b0

Updated Textract logging

Browse files

Files changed (5) hide show

app.py +9 -9
load_s3_logs.py +20 -9
tools/file_redaction.py +26 -28
tools/helper_functions.py +1 -1
tools/textract_batch_call.py +7 -4

app.py CHANGED Viewed

@@ -201,7 +201,7 @@ with app:
     job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
     textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
     selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
-    is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
     job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
     textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
@@ -498,7 +498,7 @@ with app:
     success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
     # Run redaction function
-    document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
         success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
         success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
                     outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
@@ -520,7 +520,7 @@ with app:
     all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
     # Send whole document to Textract for text extraction
-    send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
     check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
         success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
@@ -714,20 +714,20 @@ with app:
     usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
-        usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
-        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
-        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     else:
-        usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
-        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
-        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 if __name__ == "__main__":

     job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
     textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
     selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
+    is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
     job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
     textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
     success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
     # Run redaction function
+    document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
         success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
         success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
                     outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
     all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
     # Send whole document to Textract for text extraction
+    send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number])
     check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
         success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
     usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
     if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
+        usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
+        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     else:
+        usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
+        latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
+        successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
         success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 if __name__ == "__main__":

load_s3_logs.py CHANGED Viewed

@@ -2,14 +2,22 @@ import boto3
 import pandas as pd
 from io import StringIO
 from datetime import datetime
-from tools.config import DOCUMENT_REDACTION_BUCKET
-# S3 setup
-s3 = boto3.client('s3')
 bucket_name = DOCUMENT_REDACTION_BUCKET
-prefix = 'logs'# 'usage/' # 'feedback/'  # Change as needed - top-level folder where logs are stored
-earliest_date = '20250401' # Earliest date of logs folder retrieved
-latest_date = '20250412' # Latest date of logs folder retrieved
 # Function to list all files in a folder
 def list_files_in_s3(bucket, prefix):
@@ -24,8 +32,8 @@ def is_within_date_range(date_str, start_date, end_date):
     return start_date <= date_obj <= end_date
 # Define the date range
-start_date = datetime.strptime('20250401', '%Y%m%d')  # Replace with your start date
-end_date = datetime.strptime('20250412', '%Y%m%d')    # Replace with your end date
 # List all subfolders under 'usage/'
 all_files = list_files_in_s3(bucket_name, prefix)
@@ -44,7 +52,10 @@ df_list = []
 for log_file in log_files:
     # Download the file
     obj = s3.get_object(Bucket=bucket_name, Key=log_file)
-    csv_content = obj['Body'].read().decode('utf-8')
     # Read CSV content into pandas DataFrame
     try:

 import pandas as pd
 from io import StringIO
 from datetime import datetime
+from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
+# Combine log files so that they can then be used for e.g. dashboarding and financial tracking.
+# S3 setup. Try to use provided keys (needs S3 permissions), otherwise assume AWS SSO connection
+if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
+    s3 = boto3.client('s3',
+                aws_access_key_id=AWS_ACCESS_KEY,
+                aws_secret_access_key=AWS_SECRET_KEY,
+                region_name=AWS_REGION)
+else: s3 = boto3.client('s3')
 bucket_name = DOCUMENT_REDACTION_BUCKET
+prefix = 'usage/' # 'feedback/' # 'logs/' # Change as needed - top-level folder where logs are stored
+earliest_date = '20250409' # Earliest date of logs folder retrieved
+latest_date = '20250423' # Latest date of logs folder retrieved
 # Function to list all files in a folder
 def list_files_in_s3(bucket, prefix):
     return start_date <= date_obj <= end_date
 # Define the date range
+start_date = datetime.strptime(earliest_date, '%Y%m%d')  # Replace with your start date
+end_date = datetime.strptime(latest_date, '%Y%m%d')    # Replace with your end date
 # List all subfolders under 'usage/'
 all_files = list_files_in_s3(bucket_name, prefix)
 for log_file in log_files:
     # Download the file
     obj = s3.get_object(Bucket=bucket_name, Key=log_file)
+    try:
+        csv_content = obj['Body'].read().decode('utf-8')
+    except:
+        csv_content = obj['Body'].read().decode('latin-1')
     # Read CSV content into pandas DataFrame
     try:

tools/file_redaction.py CHANGED Viewed

@@ -99,7 +99,7 @@ def choose_and_run_redactor(file_paths:List[str],
  duplication_file_path_outputs:list=[],
  review_file_path:str="",
  input_folder:str=INPUT_FOLDER,
- textract_query_number:int=0,
  ocr_file_path:str="",
  prepare_images:bool=True,
  progress=gr.Progress(track_tqdm=True)):
@@ -148,7 +148,7 @@ def choose_and_run_redactor(file_paths:List[str],
     - duplication_file_outputs (list, optional): List to allow for export to the duplication function page.
     - review_file_path (str, optional): The latest review file path created by the app
     - input_folder (str, optional): The custom input path, if provided
-    - textract_query_number (int, optional): The number of textract queries up until this point.
     - ocr_file_path (str, optional): The latest ocr file path created by the app
     - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -160,7 +160,7 @@ def choose_and_run_redactor(file_paths:List[str],
     out_message = ""
     pdf_file_name_with_ext = ""
     pdf_file_name_without_ext = ""
-    request_metadata = ""
     all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
     review_out_file_paths = [prepared_pdf_file_paths[0]]
@@ -229,7 +229,7 @@ def choose_and_run_redactor(file_paths:List[str],
         estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
-        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
     #if first_loop_state == False:
     # Prepare documents and images as required if they don't already exist
@@ -292,7 +292,7 @@ def choose_and_run_redactor(file_paths:List[str],
                 #review_file_path = [x for x in out_file_paths if "review_file" in x]
                 if review_file_path: review_out_file_paths.append(review_file_path)
-        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
     # Load/create allow list
     # If string, assume file path
@@ -422,7 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
             print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
-            pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
              pdf_image_file_paths,
              language,
              chosen_redact_entities,
@@ -432,7 +432,7 @@ def choose_and_run_redactor(file_paths:List[str],
              page_max,
              text_extraction_method,
              handwrite_signature_checkbox,
-             request_metadata,
              current_loop_page,
              page_break_return,
              annotations_all_pages,
@@ -453,7 +453,10 @@ def choose_and_run_redactor(file_paths:List[str],
              output_folder=output_folder)
             # Save Textract request metadata (if exists)
-            if new_request_metadata: all_textract_request_metadata.append(new_request_metadata)
         elif text_extraction_method == text_ocr_option:
@@ -541,8 +544,6 @@ def choose_and_run_redactor(file_paths:List[str],
             annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
             # Save the gradio_annotation_boxes to a review csv file
             review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
@@ -575,7 +576,7 @@ def choose_and_run_redactor(file_paths:List[str],
             estimated_time_taken_state += time_taken
    # If textract requests made, write to logging file. Also record number of Textract requests
-    if all_textract_request_metadata:
         all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
         all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
@@ -587,11 +588,8 @@ def choose_and_run_redactor(file_paths:List[str],
         if all_textract_request_metadata_file_path not in log_files_output_paths:
             log_files_output_paths.append(all_textract_request_metadata_file_path)
-        new_textract_queries = len(all_textract_request_metadata)
-        textract_query_number += new_textract_queries
-    #if combined_out_message: out_message = combined_out_message
     # Ensure no duplicated output files
     log_files_output_paths = sorted(list(set(log_files_output_paths)))
@@ -601,7 +599,7 @@ def choose_and_run_redactor(file_paths:List[str],
     if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
     else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
-    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
     '''
@@ -1164,7 +1162,7 @@ def redact_image_pdf(file_path:str,
                      page_max:int=999,
                      text_extraction_method:str=tesseract_ocr_option,
                      handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
-                     request_metadata:str="",
                      current_loop_page:int=0,
                      page_break_return:bool=False,
                      annotations_all_pages:List=[],
@@ -1200,7 +1198,7 @@ def redact_image_pdf(file_path:str,
     - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
     - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
     - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
-    - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
     - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
     - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
     - all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
@@ -1350,7 +1348,7 @@ def redact_image_pdf(file_path:str,
                         image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                         pdf_page_as_bytes = image_buffer.getvalue()
-                        text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
                         if textract_json_file_path not in log_files_output_paths:
                             log_files_output_paths.append(textract_json_file_path)
@@ -1359,9 +1357,9 @@ def redact_image_pdf(file_path:str,
                     except Exception as e:
                         print("Textract extraction for page", reported_page_number, "failed due to:", e)
                         textract_data = {"pages":[]}
-                        new_request_metadata = "Failed Textract API call"
-                    request_metadata = request_metadata + "\n" + new_request_metadata
                 else:
                     # Check if the current reported_page_number exists in the loaded JSON
@@ -1376,7 +1374,7 @@ def redact_image_pdf(file_path:str,
                             image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                             pdf_page_as_bytes = image_buffer.getvalue()
-                            text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
                             # Check if "pages" key exists, if not, initialise it as an empty list
                             if "pages" not in textract_data: textract_data["pages"] = []
@@ -1388,14 +1386,14 @@ def redact_image_pdf(file_path:str,
                             out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
                             print(out_message)
                             text_blocks = []
-                            new_request_metadata = "Failed Textract API call"
                             # Check if "pages" key exists, if not, initialise it as an empty list
                             if "pages" not in textract_data: textract_data["pages"] = []
                             raise Exception(out_message)
-                        request_metadata = request_metadata + "\n" + new_request_metadata
                     else:
                         # If the page exists, retrieve the data
@@ -1563,7 +1561,7 @@ def redact_image_pdf(file_path:str,
                 current_loop_page += 1
-                return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
         # If it's an image file
         if is_pdf(file_path) == False:
@@ -1599,7 +1597,7 @@ def redact_image_pdf(file_path:str,
             all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
             all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
-            return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
     if text_extraction_method == textract_option:
         # Write the updated existing textract data back to the JSON file
@@ -1619,7 +1617,7 @@ def redact_image_pdf(file_path:str,
     all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
-    return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 ###

  duplication_file_path_outputs:list=[],
  review_file_path:str="",
  input_folder:str=INPUT_FOLDER,
+ total_textract_query_number:int=0,
  ocr_file_path:str="",
  prepare_images:bool=True,
  progress=gr.Progress(track_tqdm=True)):
     - duplication_file_path_outputs (list, optional): List to allow for export to the duplication function page.
     - review_file_path (str, optional): The latest review file path created by the app
     - input_folder (str, optional): The custom input path, if provided
+    - total_textract_query_number (int, optional): The number of textract queries up until this point.
     - ocr_file_path (str, optional): The latest ocr file path created by the app
     - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
     out_message = ""
     pdf_file_name_with_ext = ""
     pdf_file_name_without_ext = ""
+    blank_request_metadata = []
     all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
     review_out_file_paths = [prepared_pdf_file_paths[0]]
         estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
     #if first_loop_state == False:
     # Prepare documents and images as required if they don't already exist
                 #review_file_path = [x for x in out_file_paths if "review_file" in x]
                 if review_file_path: review_out_file_paths.append(review_file_path)
+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
     # Load/create allow list
     # If string, assume file path
             print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
+            pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
              pdf_image_file_paths,
              language,
              chosen_redact_entities,
              page_max,
              text_extraction_method,
              handwrite_signature_checkbox,
+             blank_request_metadata,
              current_loop_page,
              page_break_return,
              annotations_all_pages,
              output_folder=output_folder)
             # Save Textract request metadata (if exists)
+            if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
+                all_textract_request_metadata.extend(new_textract_request_metadata)
         elif text_extraction_method == text_ocr_option:
             annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
             # Save the gradio_annotation_boxes to a review csv file
             review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
             estimated_time_taken_state += time_taken
   # If Textract requests were made, write them to the logging file. Also record the number of Textract requests
+    if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
         all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
         all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
         if all_textract_request_metadata_file_path not in log_files_output_paths:
             log_files_output_paths.append(all_textract_request_metadata_file_path)
+        new_textract_query_numbers = len(all_textract_request_metadata)
+        total_textract_query_number += new_textract_query_numbers
     # Ensure no duplicated output files
     log_files_output_paths = sorted(list(set(log_files_output_paths)))
     if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
     else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
+    return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
     '''
                      page_max:int=999,
                      text_extraction_method:str=tesseract_ocr_option,
                      handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
+                     textract_request_metadata:list=[],
                      current_loop_page:int=0,
                      page_break_return:bool=False,
                      annotations_all_pages:List=[],
     - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
     - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
     - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
+    - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty list.
     - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
     - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
     - all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
                         image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                         pdf_page_as_bytes = image_buffer.getvalue()
+                        text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
                         if textract_json_file_path not in log_files_output_paths:
                             log_files_output_paths.append(textract_json_file_path)
                     except Exception as e:
                         print("Textract extraction for page", reported_page_number, "failed due to:", e)
                         textract_data = {"pages":[]}
+                        new_textract_request_metadata = "Failed Textract API call"
+                    textract_request_metadata.append(new_textract_request_metadata)
                 else:
                     # Check if the current reported_page_number exists in the loaded JSON
                             image.save(image_buffer, format='PNG')  # Save as PNG, or adjust format if needed
                             pdf_page_as_bytes = image_buffer.getvalue()
+                            text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
                             # Check if "pages" key exists, if not, initialise it as an empty list
                             if "pages" not in textract_data: textract_data["pages"] = []
                             out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
                             print(out_message)
                             text_blocks = []
+                            new_textract_request_metadata = "Failed Textract API call"
                             # Check if "pages" key exists, if not, initialise it as an empty list
                             if "pages" not in textract_data: textract_data["pages"] = []
                             raise Exception(out_message)
+                        textract_request_metadata.append(new_textract_request_metadata)
                     else:
                         # If the page exists, retrieve the data
                 current_loop_page += 1
+                return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
         # If it's an image file
         if is_pdf(file_path) == False:
             all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
             all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
+            return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
     if text_extraction_method == textract_option:
         # Write the updated existing textract data back to the JSON file
     all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
+    return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 ###

tools/helper_functions.py CHANGED Viewed

@@ -31,7 +31,7 @@ def reset_state_vars():
             show_share_button=False,
             show_remove_button=False,
             interactive=False
-        ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
 def reset_ocr_results_state():
     return pd.DataFrame(), pd.DataFrame(), []

             show_share_button=False,
             show_remove_button=False,
             interactive=False
+        ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0
 def reset_ocr_results_state():
     return pd.DataFrame(), pd.DataFrame(), []

tools/textract_batch_call.py CHANGED Viewed

@@ -22,6 +22,7 @@ def analyse_document_with_textract_api(
     local_output_dir: str = OUTPUT_FOLDER,
     analyse_signatures:List[str] = [],
     successful_job_number:int=0,
     general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
     aws_region: str = AWS_REGION # Optional: specify region if not default
     ):
@@ -39,6 +40,7 @@ def analyse_document_with_textract_api(
         local_output_dir (str, optional): Local directory to save the downloaded JSON results.
         analyse_signatures (List[str], optional): Analyse signatures? Default is no.
         successful_job_number (int): The number of successful jobs that have been submitted in this session.
         aws_region (str, optional): AWS region name. Defaults to boto3 default region.
     Returns:
@@ -189,8 +191,9 @@ def analyse_document_with_textract_api(
         raise
     successful_job_number += 1
-    return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
 def return_job_status(job_id:str,
                      response:dict,
@@ -457,9 +460,9 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
         try:
             s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
-            print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
             s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
-            print("Download successful.")
         except ClientError as e:
             if e.response['Error']['Code'] == '404':
                 print("Log file does not exist in S3.")
@@ -527,4 +530,4 @@ def check_textract_outputs_exist(textract_output_found_checkbox):
         if textract_output_found_checkbox == True:
             print("Textract outputs found")
             return
-        else: raise Exception("Relevant Tetract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")

     local_output_dir: str = OUTPUT_FOLDER,
     analyse_signatures:List[str] = [],
     successful_job_number:int=0,
+    total_document_page_count:int=1,
     general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
     aws_region: str = AWS_REGION # Optional: specify region if not default
     ):
         local_output_dir (str, optional): Local directory to save the downloaded JSON results.
         analyse_signatures (List[str], optional): Analyse signatures? Default is no.
         successful_job_number (int): The number of successful jobs that have been submitted in this session.
+        total_document_page_count (int): The number of pages in the document
         aws_region (str, optional): AWS region name. Defaults to boto3 default region.
     Returns:
         raise
     successful_job_number += 1
+    total_number_of_textract_page_calls = total_document_page_count
+    return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls
 def return_job_status(job_id:str,
                      response:dict,
         try:
             s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
+            #print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
             s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
+            #print("Download successful.")
         except ClientError as e:
             if e.response['Error']['Code'] == '404':
                 print("Log file does not exist in S3.")
         if textract_output_found_checkbox == True:
             print("Textract outputs found")
             return
+        else: raise Exception("Relevant Textract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")