seanpedrickcase committed on
Commit
8953ca0
·
1 Parent(s): a56b9b0

Updated Textract logging

Browse files
app.py CHANGED
@@ -201,7 +201,7 @@ with app:
201
  job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
202
  textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
203
  selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
204
- is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
205
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
206
 
207
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
@@ -498,7 +498,7 @@ with app:
498
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
499
 
500
  # Run redaction function
501
- document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
502
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
503
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
504
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
@@ -520,7 +520,7 @@ with app:
520
  all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
521
 
522
  # Send whole document to Textract for text extraction
523
- send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
524
 
525
  check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
526
  success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
@@ -714,20 +714,20 @@ with app:
714
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
715
 
716
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
717
- usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
718
 
719
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
720
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
721
 
722
- successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
723
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
724
  else:
725
- usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
726
 
727
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
728
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
729
 
730
- successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
731
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
732
 
733
  if __name__ == "__main__":
 
201
  job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
202
  textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
203
  selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
204
+ is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
205
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
206
 
207
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
 
498
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
499
 
500
  # Run redaction function
501
+ document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
502
  success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
503
  success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
504
  outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
 
520
  all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
521
 
522
  # Send whole document to Textract for text extraction
523
+ send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number])
524
 
525
  check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
526
  success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
 
714
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
715
 
716
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
717
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
718
 
719
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
720
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
721
 
722
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
723
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
724
  else:
725
+ usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
726
 
727
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
728
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
729
 
730
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
731
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
732
 
733
  if __name__ == "__main__":
load_s3_logs.py CHANGED
@@ -2,14 +2,22 @@ import boto3
2
  import pandas as pd
3
  from io import StringIO
4
  from datetime import datetime
5
- from tools.config import DOCUMENT_REDACTION_BUCKET
 
 
 
 
 
 
 
 
 
 
6
 
7
- # S3 setup
8
- s3 = boto3.client('s3')
9
  bucket_name = DOCUMENT_REDACTION_BUCKET
10
- prefix = 'logs'# 'usage/' # 'feedback/' # Change as needed - top-level folder where logs are stored
11
- earliest_date = '20250401' # Earliest date of logs folder retrieved
12
- latest_date = '20250412' # Latest date of logs folder retrieved
13
 
14
  # Function to list all files in a folder
15
  def list_files_in_s3(bucket, prefix):
@@ -24,8 +32,8 @@ def is_within_date_range(date_str, start_date, end_date):
24
  return start_date <= date_obj <= end_date
25
 
26
  # Define the date range
27
- start_date = datetime.strptime('20250401', '%Y%m%d') # Replace with your start date
28
- end_date = datetime.strptime('20250412', '%Y%m%d') # Replace with your end date
29
 
30
  # List all subfolders under 'usage/'
31
  all_files = list_files_in_s3(bucket_name, prefix)
@@ -44,7 +52,10 @@ df_list = []
44
  for log_file in log_files:
45
  # Download the file
46
  obj = s3.get_object(Bucket=bucket_name, Key=log_file)
47
- csv_content = obj['Body'].read().decode('utf-8')
 
 
 
48
 
49
  # Read CSV content into pandas DataFrame
50
  try:
 
2
  import pandas as pd
3
  from io import StringIO
4
  from datetime import datetime
5
+ from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
6
+
7
+ # Combine log files together so that they can then be used for e.g. dashboarding and financial tracking.
8
+
9
+ # S3 setup. Try to use provided keys (needs S3 permissions), otherwise assume AWS SSO connection
10
+ if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
11
+ s3 = boto3.client('s3',
12
+ aws_access_key_id=AWS_ACCESS_KEY,
13
+ aws_secret_access_key=AWS_SECRET_KEY,
14
+ region_name=AWS_REGION)
15
+ else: s3 = boto3.client('s3')
16
 
 
 
17
  bucket_name = DOCUMENT_REDACTION_BUCKET
18
+ prefix = 'usage/' # 'feedback/' # 'logs/' # Change as needed - top-level folder where logs are stored
19
+ earliest_date = '20250409' # Earliest date of logs folder retrieved
20
+ latest_date = '20250423' # Latest date of logs folder retrieved
21
 
22
  # Function to list all files in a folder
23
  def list_files_in_s3(bucket, prefix):
 
32
  return start_date <= date_obj <= end_date
33
 
34
  # Define the date range
35
+ start_date = datetime.strptime(earliest_date, '%Y%m%d') # Replace with your start date
36
+ end_date = datetime.strptime(latest_date, '%Y%m%d') # Replace with your end date
37
 
38
  # List all subfolders under 'usage/'
39
  all_files = list_files_in_s3(bucket_name, prefix)
 
52
  for log_file in log_files:
53
  # Download the file
54
  obj = s3.get_object(Bucket=bucket_name, Key=log_file)
55
+ try:
56
+ csv_content = obj['Body'].read().decode('utf-8')
57
+ except:
58
+ csv_content = obj['Body'].read().decode('latin-1')
59
 
60
  # Read CSV content into pandas DataFrame
61
  try:
tools/file_redaction.py CHANGED
@@ -99,7 +99,7 @@ def choose_and_run_redactor(file_paths:List[str],
99
  duplication_file_path_outputs:list=[],
100
  review_file_path:str="",
101
  input_folder:str=INPUT_FOLDER,
102
- textract_query_number:int=0,
103
  ocr_file_path:str="",
104
  prepare_images:bool=True,
105
  progress=gr.Progress(track_tqdm=True)):
@@ -148,7 +148,7 @@ def choose_and_run_redactor(file_paths:List[str],
148
  - duplication_file_outputs (list, optional): List to allow for export to the duplication function page.
149
  - review_file_path (str, optional): The latest review file path created by the app
150
  - input_folder (str, optional): The custom input path, if provided
151
- - textract_query_number (int, optional): The number of textract queries up until this point.
152
  - ocr_file_path (str, optional): The latest ocr file path created by the app
153
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
154
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -160,7 +160,7 @@ def choose_and_run_redactor(file_paths:List[str],
160
  out_message = ""
161
  pdf_file_name_with_ext = ""
162
  pdf_file_name_without_ext = ""
163
- request_metadata = ""
164
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
165
  review_out_file_paths = [prepared_pdf_file_paths[0]]
166
 
@@ -229,7 +229,7 @@ def choose_and_run_redactor(file_paths:List[str],
229
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
230
  print("Estimated total processing time:", str(estimate_total_processing_time))
231
 
232
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
233
 
234
  #if first_loop_state == False:
235
  # Prepare documents and images as required if they don't already exist
@@ -292,7 +292,7 @@ def choose_and_run_redactor(file_paths:List[str],
292
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
293
  if review_file_path: review_out_file_paths.append(review_file_path)
294
 
295
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
296
 
297
  # Load/create allow list
298
  # If string, assume file path
@@ -422,7 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
422
 
423
  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
424
 
425
- pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
426
  pdf_image_file_paths,
427
  language,
428
  chosen_redact_entities,
@@ -432,7 +432,7 @@ def choose_and_run_redactor(file_paths:List[str],
432
  page_max,
433
  text_extraction_method,
434
  handwrite_signature_checkbox,
435
- request_metadata,
436
  current_loop_page,
437
  page_break_return,
438
  annotations_all_pages,
@@ -453,7 +453,10 @@ def choose_and_run_redactor(file_paths:List[str],
453
  output_folder=output_folder)
454
 
455
  # Save Textract request metadata (if exists)
456
- if new_request_metadata: all_textract_request_metadata.append(new_request_metadata)
 
 
 
457
 
458
  elif text_extraction_method == text_ocr_option:
459
 
@@ -541,8 +544,6 @@ def choose_and_run_redactor(file_paths:List[str],
541
 
542
  annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
543
 
544
-
545
-
546
  # Save the gradio_annotation_boxes to a review csv file
547
  review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
548
 
@@ -575,7 +576,7 @@ def choose_and_run_redactor(file_paths:List[str],
575
  estimated_time_taken_state += time_taken
576
 
577
  # If Textract requests were made, write them to a logging file. Also record the number of Textract requests
578
- if all_textract_request_metadata:
579
  all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
580
 
581
  all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
@@ -587,11 +588,8 @@ def choose_and_run_redactor(file_paths:List[str],
587
  if all_textract_request_metadata_file_path not in log_files_output_paths:
588
  log_files_output_paths.append(all_textract_request_metadata_file_path)
589
 
590
- new_textract_queries = len(all_textract_request_metadata)
591
-
592
- textract_query_number += new_textract_queries
593
-
594
- #if combined_out_message: out_message = combined_out_message
595
 
596
  # Ensure no duplicated output files
597
  log_files_output_paths = sorted(list(set(log_files_output_paths)))
@@ -601,7 +599,7 @@ def choose_and_run_redactor(file_paths:List[str],
601
  if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
602
  else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
603
 
604
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, textract_query_number, ocr_file_path
605
 
606
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
607
  '''
@@ -1164,7 +1162,7 @@ def redact_image_pdf(file_path:str,
1164
  page_max:int=999,
1165
  text_extraction_method:str=tesseract_ocr_option,
1166
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
1167
- request_metadata:str="",
1168
  current_loop_page:int=0,
1169
  page_break_return:bool=False,
1170
  annotations_all_pages:List=[],
@@ -1200,7 +1198,7 @@ def redact_image_pdf(file_path:str,
1200
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
1201
  - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
1202
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1203
- - request_metadata (str, optional): Metadata related to the redaction request. Defaults to an empty string.
1204
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
1205
  - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
1206
  - all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
@@ -1350,7 +1348,7 @@ def redact_image_pdf(file_path:str,
1350
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
1351
  pdf_page_as_bytes = image_buffer.getvalue()
1352
 
1353
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1354
 
1355
  if textract_json_file_path not in log_files_output_paths:
1356
  log_files_output_paths.append(textract_json_file_path)
@@ -1359,9 +1357,9 @@ def redact_image_pdf(file_path:str,
1359
  except Exception as e:
1360
  print("Textract extraction for page", reported_page_number, "failed due to:", e)
1361
  textract_data = {"pages":[]}
1362
- new_request_metadata = "Failed Textract API call"
1363
 
1364
- request_metadata = request_metadata + "\n" + new_request_metadata
1365
 
1366
  else:
1367
  # Check if the current reported_page_number exists in the loaded JSON
@@ -1376,7 +1374,7 @@ def redact_image_pdf(file_path:str,
1376
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
1377
  pdf_page_as_bytes = image_buffer.getvalue()
1378
 
1379
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1380
 
1381
  # Check if "pages" key exists, if not, initialise it as an empty list
1382
  if "pages" not in textract_data: textract_data["pages"] = []
@@ -1388,14 +1386,14 @@ def redact_image_pdf(file_path:str,
1388
  out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1389
  print(out_message)
1390
  text_blocks = []
1391
- new_request_metadata = "Failed Textract API call"
1392
 
1393
  # Check if "pages" key exists, if not, initialise it as an empty list
1394
  if "pages" not in textract_data: textract_data["pages"] = []
1395
 
1396
  raise Exception(out_message)
1397
 
1398
- request_metadata = request_metadata + "\n" + new_request_metadata
1399
 
1400
  else:
1401
  # If the page exists, retrieve the data
@@ -1563,7 +1561,7 @@ def redact_image_pdf(file_path:str,
1563
 
1564
  current_loop_page += 1
1565
 
1566
- return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1567
 
1568
  # If it's an image file
1569
  if is_pdf(file_path) == False:
@@ -1599,7 +1597,7 @@ def redact_image_pdf(file_path:str,
1599
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1600
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1601
 
1602
- return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1603
 
1604
  if text_extraction_method == textract_option:
1605
  # Write the updated existing textract data back to the JSON file
@@ -1619,7 +1617,7 @@ def redact_image_pdf(file_path:str,
1619
 
1620
  all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
1621
 
1622
- return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1623
 
1624
 
1625
  ###
 
99
  duplication_file_path_outputs:list=[],
100
  review_file_path:str="",
101
  input_folder:str=INPUT_FOLDER,
102
+ total_textract_query_number:int=0,
103
  ocr_file_path:str="",
104
  prepare_images:bool=True,
105
  progress=gr.Progress(track_tqdm=True)):
 
148
  - duplication_file_outputs (list, optional): List to allow for export to the duplication function page.
149
  - review_file_path (str, optional): The latest review file path created by the app
150
  - input_folder (str, optional): The custom input path, if provided
151
+ - total_textract_query_number (int, optional): The number of textract queries up until this point.
152
  - ocr_file_path (str, optional): The latest ocr file path created by the app
153
  - prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
154
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 
160
  out_message = ""
161
  pdf_file_name_with_ext = ""
162
  pdf_file_name_without_ext = ""
163
+ blank_request_metadata = []
164
  all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
165
  review_out_file_paths = [prepared_pdf_file_paths[0]]
166
 
 
229
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
230
  print("Estimated total processing time:", str(estimate_total_processing_time))
231
 
232
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
233
 
234
  #if first_loop_state == False:
235
  # Prepare documents and images as required if they don't already exist
 
292
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
293
  if review_file_path: review_out_file_paths.append(review_file_path)
294
 
295
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
296
 
297
  # Load/create allow list
298
  # If string, assume file path
 
422
 
423
  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
424
 
425
+ pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
426
  pdf_image_file_paths,
427
  language,
428
  chosen_redact_entities,
 
432
  page_max,
433
  text_extraction_method,
434
  handwrite_signature_checkbox,
435
+ blank_request_metadata,
436
  current_loop_page,
437
  page_break_return,
438
  annotations_all_pages,
 
453
  output_folder=output_folder)
454
 
455
  # Save Textract request metadata (if exists)
456
+
457
+ if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
458
+ all_textract_request_metadata.extend(new_textract_request_metadata)
459
+
460
 
461
  elif text_extraction_method == text_ocr_option:
462
 
 
544
 
545
  annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
546
 
 
 
547
  # Save the gradio_annotation_boxes to a review csv file
548
  review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
549
 
 
576
  estimated_time_taken_state += time_taken
577
 
578
  # If textract requests made, write to logging file. Also record number of Textract requests
579
+ if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
580
  all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
581
 
582
  all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
 
588
  if all_textract_request_metadata_file_path not in log_files_output_paths:
589
  log_files_output_paths.append(all_textract_request_metadata_file_path)
590
 
591
+ new_textract_query_numbers = len(all_textract_request_metadata)
592
+ total_textract_query_number += new_textract_query_numbers
 
 
 
593
 
594
  # Ensure no duplicated output files
595
  log_files_output_paths = sorted(list(set(log_files_output_paths)))
 
599
  if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
600
  else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
601
 
602
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
603
 
604
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
605
  '''
 
1162
  page_max:int=999,
1163
  text_extraction_method:str=tesseract_ocr_option,
1164
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
1165
+ textract_request_metadata:list=[],
1166
  current_loop_page:int=0,
1167
  page_break_return:bool=False,
1168
  annotations_all_pages:List=[],
 
1198
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
1199
  - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
1200
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1201
+ - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty list.
1202
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
1203
  - annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
1204
  - all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
 
1348
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
1349
  pdf_page_as_bytes = image_buffer.getvalue()
1350
 
1351
+ text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1352
 
1353
  if textract_json_file_path not in log_files_output_paths:
1354
  log_files_output_paths.append(textract_json_file_path)
 
1357
  except Exception as e:
1358
  print("Textract extraction for page", reported_page_number, "failed due to:", e)
1359
  textract_data = {"pages":[]}
1360
+ new_textract_request_metadata = "Failed Textract API call"
1361
 
1362
+ textract_request_metadata.append(new_textract_request_metadata)
1363
 
1364
  else:
1365
  # Check if the current reported_page_number exists in the loaded JSON
 
1374
  image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
1375
  pdf_page_as_bytes = image_buffer.getvalue()
1376
 
1377
+ text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1378
 
1379
  # Check if "pages" key exists, if not, initialise it as an empty list
1380
  if "pages" not in textract_data: textract_data["pages"] = []
 
1386
  out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1387
  print(out_message)
1388
  text_blocks = []
1389
+ new_textract_request_metadata = "Failed Textract API call"
1390
 
1391
  # Check if "pages" key exists, if not, initialise it as an empty list
1392
  if "pages" not in textract_data: textract_data["pages"] = []
1393
 
1394
  raise Exception(out_message)
1395
 
1396
+ textract_request_metadata.append(new_textract_request_metadata)
1397
 
1398
  else:
1399
  # If the page exists, retrieve the data
 
1561
 
1562
  current_loop_page += 1
1563
 
1564
+ return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1565
 
1566
  # If it's an image file
1567
  if is_pdf(file_path) == False:
 
1597
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
1598
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
1599
 
1600
+ return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1601
 
1602
  if text_extraction_method == textract_option:
1603
  # Write the updated existing textract data back to the JSON file
 
1617
 
1618
  all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
1619
 
1620
+ return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1621
 
1622
 
1623
  ###
tools/helper_functions.py CHANGED
@@ -31,7 +31,7 @@ def reset_state_vars():
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
- ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
 
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
tools/textract_batch_call.py CHANGED
@@ -22,6 +22,7 @@ def analyse_document_with_textract_api(
22
  local_output_dir: str = OUTPUT_FOLDER,
23
  analyse_signatures:List[str] = [],
24
  successful_job_number:int=0,
 
25
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
26
  aws_region: str = AWS_REGION # Optional: specify region if not default
27
  ):
@@ -39,6 +40,7 @@ def analyse_document_with_textract_api(
39
  local_output_dir (str, optional): Local directory to save the downloaded JSON results.
40
  analyse_signatures (List[str], optional): Analyse signatures? Default is no.
41
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
 
42
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
43
 
44
  Returns:
@@ -189,8 +191,9 @@ def analyse_document_with_textract_api(
189
  raise
190
 
191
  successful_job_number += 1
 
192
 
193
- return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
194
 
195
  def return_job_status(job_id:str,
196
  response:dict,
@@ -457,9 +460,9 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
457
 
458
  try:
459
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
460
- print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
461
  s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
462
- print("Download successful.")
463
  except ClientError as e:
464
  if e.response['Error']['Code'] == '404':
465
  print("Log file does not exist in S3.")
@@ -527,4 +530,4 @@ def check_textract_outputs_exist(textract_output_found_checkbox):
527
  if textract_output_found_checkbox == True:
528
  print("Textract outputs found")
529
  return
530
- else: raise Exception("Relevant Tetract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")
 
22
  local_output_dir: str = OUTPUT_FOLDER,
23
  analyse_signatures:List[str] = [],
24
  successful_job_number:int=0,
25
+ total_document_page_count:int=1,
26
  general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
27
  aws_region: str = AWS_REGION # Optional: specify region if not default
28
  ):
 
40
  local_output_dir (str, optional): Local directory to save the downloaded JSON results.
41
  analyse_signatures (List[str], optional): Analyse signatures? Default is no.
42
  successful_job_number (int): The number of successful jobs that have been submitted in this session.
43
+ total_document_page_count (int): The number of pages in the document
44
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
45
 
46
  Returns:
 
191
  raise
192
 
193
  successful_job_number += 1
194
+ total_number_of_textract_page_calls = total_document_page_count
195
 
196
+ return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls
197
 
198
  def return_job_status(job_id:str,
199
  response:dict,
 
460
 
461
  try:
462
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
463
+ #print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
464
  s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
465
+ #print("Download successful.")
466
  except ClientError as e:
467
  if e.response['Error']['Code'] == '404':
468
  print("Log file does not exist in S3.")
 
530
  if textract_output_found_checkbox == True:
531
  print("Textract outputs found")
532
  return
533
+ else: raise Exception("Relevant Textract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")