Commit
·
8953ca0
1
Parent(s):
a56b9b0
Updated Textract logging
Browse files- app.py +9 -9
- load_s3_logs.py +20 -9
- tools/file_redaction.py +26 -28
- tools/helper_functions.py +1 -1
- tools/textract_batch_call.py +7 -4
app.py
CHANGED
@@ -201,7 +201,7 @@ with app:
|
|
201 |
job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
|
202 |
textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
|
203 |
selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
|
204 |
-
is_a_textract_api_call = gr.Checkbox(value=False, label="
|
205 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
206 |
|
207 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
@@ -498,7 +498,7 @@ with app:
|
|
498 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
499 |
|
500 |
# Run redaction function
|
501 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
|
502 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
503 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
504 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
|
@@ -520,7 +520,7 @@ with app:
|
|
520 |
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
521 |
|
522 |
# Send whole document to Textract for text extraction
|
523 |
-
send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
|
524 |
|
525 |
check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
|
526 |
success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
|
@@ -714,20 +714,20 @@ with app:
|
|
714 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
715 |
|
716 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
717 |
-
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
|
718 |
|
719 |
-
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
720 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
721 |
|
722 |
-
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
723 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
724 |
else:
|
725 |
-
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
|
726 |
|
727 |
-
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
728 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
729 |
|
730 |
-
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
731 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
732 |
|
733 |
if __name__ == "__main__":
|
|
|
201 |
job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
|
202 |
textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
|
203 |
selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
|
204 |
+
is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
|
205 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
206 |
|
207 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
|
|
498 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
499 |
|
500 |
# Run redaction function
|
501 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
502 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
503 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
504 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
|
|
|
520 |
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
521 |
|
522 |
# Send whole document to Textract for text extraction
|
523 |
+
send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number])
|
524 |
|
525 |
check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
|
526 |
success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
|
|
|
714 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
715 |
|
716 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
717 |
+
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
718 |
|
719 |
+
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
720 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
721 |
|
722 |
+
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
723 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
724 |
else:
|
725 |
+
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
726 |
|
727 |
+
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
728 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
729 |
|
730 |
+
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
731 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
732 |
|
733 |
if __name__ == "__main__":
|
load_s3_logs.py
CHANGED
@@ -2,14 +2,22 @@ import boto3
|
|
2 |
import pandas as pd
|
3 |
from io import StringIO
|
4 |
from datetime import datetime
|
5 |
-
from tools.config import DOCUMENT_REDACTION_BUCKET
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
|
7 |
-
# S3 setup
|
8 |
-
s3 = boto3.client('s3')
|
9 |
bucket_name = DOCUMENT_REDACTION_BUCKET
|
10 |
-
prefix = '
|
11 |
-
earliest_date = '
|
12 |
-
latest_date = '
|
13 |
|
14 |
# Function to list all files in a folder
|
15 |
def list_files_in_s3(bucket, prefix):
|
@@ -24,8 +32,8 @@ def is_within_date_range(date_str, start_date, end_date):
|
|
24 |
return start_date <= date_obj <= end_date
|
25 |
|
26 |
# Define the date range
|
27 |
-
start_date = datetime.strptime(
|
28 |
-
end_date = datetime.strptime(
|
29 |
|
30 |
# List all subfolders under 'usage/'
|
31 |
all_files = list_files_in_s3(bucket_name, prefix)
|
@@ -44,7 +52,10 @@ df_list = []
|
|
44 |
for log_file in log_files:
|
45 |
# Download the file
|
46 |
obj = s3.get_object(Bucket=bucket_name, Key=log_file)
|
47 |
-
|
|
|
|
|
|
|
48 |
|
49 |
# Read CSV content into pandas DataFrame
|
50 |
try:
|
|
|
2 |
import pandas as pd
|
3 |
from io import StringIO
|
4 |
from datetime import datetime
|
5 |
+
from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
|
6 |
+
|
7 |
+
# Combine log files so that they can then be used for e.g. dashboarding and financial tracking.
|
8 |
+
|
9 |
+
# S3 setup. Try to use provided keys (needs S3 permissions), otherwise assume AWS SSO connection
|
10 |
+
if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
|
11 |
+
s3 = boto3.client('s3',
|
12 |
+
aws_access_key_id=AWS_ACCESS_KEY,
|
13 |
+
aws_secret_access_key=AWS_SECRET_KEY,
|
14 |
+
region_name=AWS_REGION)
|
15 |
+
else: s3 = boto3.client('s3')
|
16 |
|
|
|
|
|
17 |
bucket_name = DOCUMENT_REDACTION_BUCKET
|
18 |
+
prefix = 'usage/' # 'feedback/' # 'logs/' # Change as needed - top-level folder where logs are stored
|
19 |
+
earliest_date = '20250409' # Earliest date of logs folder retrieved
|
20 |
+
latest_date = '20250423' # Latest date of logs folder retrieved
|
21 |
|
22 |
# Function to list all files in a folder
|
23 |
def list_files_in_s3(bucket, prefix):
|
|
|
32 |
return start_date <= date_obj <= end_date
|
33 |
|
34 |
# Define the date range
|
35 |
+
start_date = datetime.strptime(earliest_date, '%Y%m%d') # Replace with your start date
|
36 |
+
end_date = datetime.strptime(latest_date, '%Y%m%d') # Replace with your end date
|
37 |
|
38 |
# List all subfolders under 'usage/'
|
39 |
all_files = list_files_in_s3(bucket_name, prefix)
|
|
|
52 |
for log_file in log_files:
|
53 |
# Download the file
|
54 |
obj = s3.get_object(Bucket=bucket_name, Key=log_file)
|
55 |
+
try:
|
56 |
+
csv_content = obj['Body'].read().decode('utf-8')
|
57 |
+
except:
|
58 |
+
csv_content = obj['Body'].read().decode('latin-1')
|
59 |
|
60 |
# Read CSV content into pandas DataFrame
|
61 |
try:
|
tools/file_redaction.py
CHANGED
@@ -99,7 +99,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
99 |
duplication_file_path_outputs:list=[],
|
100 |
review_file_path:str="",
|
101 |
input_folder:str=INPUT_FOLDER,
|
102 |
-
|
103 |
ocr_file_path:str="",
|
104 |
prepare_images:bool=True,
|
105 |
progress=gr.Progress(track_tqdm=True)):
|
@@ -148,7 +148,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
148 |
- duplication_file_outputs (list, optional): List to allow for export to the duplication function page.
|
149 |
- review_file_path (str, optional): The latest review file path created by the app
|
150 |
- input_folder (str, optional): The custom input path, if provided
|
151 |
-
-
|
152 |
- ocr_file_path (str, optional): The latest ocr file path created by the app
|
153 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
154 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
@@ -160,7 +160,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
160 |
out_message = ""
|
161 |
pdf_file_name_with_ext = ""
|
162 |
pdf_file_name_without_ext = ""
|
163 |
-
|
164 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
165 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
166 |
|
@@ -229,7 +229,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
229 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
230 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
231 |
|
232 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path,
|
233 |
|
234 |
#if first_loop_state == False:
|
235 |
# Prepare documents and images as required if they don't already exist
|
@@ -292,7 +292,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
292 |
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
293 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
294 |
|
295 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path,
|
296 |
|
297 |
# Load/create allow list
|
298 |
# If string, assume file path
|
@@ -422,7 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
422 |
|
423 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
424 |
|
425 |
-
pymupdf_doc, all_pages_decision_process_table, out_file_paths,
|
426 |
pdf_image_file_paths,
|
427 |
language,
|
428 |
chosen_redact_entities,
|
@@ -432,7 +432,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
432 |
page_max,
|
433 |
text_extraction_method,
|
434 |
handwrite_signature_checkbox,
|
435 |
-
|
436 |
current_loop_page,
|
437 |
page_break_return,
|
438 |
annotations_all_pages,
|
@@ -453,7 +453,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
453 |
output_folder=output_folder)
|
454 |
|
455 |
# Save Textract request metadata (if exists)
|
456 |
-
|
|
|
|
|
|
|
457 |
|
458 |
elif text_extraction_method == text_ocr_option:
|
459 |
|
@@ -541,8 +544,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
541 |
|
542 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
543 |
|
544 |
-
|
545 |
-
|
546 |
# Save the gradio_annotation_boxes to a review csv file
|
547 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
548 |
|
@@ -575,7 +576,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
575 |
estimated_time_taken_state += time_taken
|
576 |
|
577 |
# If Textract requests were made, write them to the logging file. Also record the number of Textract requests
|
578 |
-
if all_textract_request_metadata:
|
579 |
all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
|
580 |
|
581 |
all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
|
@@ -587,11 +588,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
587 |
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
588 |
log_files_output_paths.append(all_textract_request_metadata_file_path)
|
589 |
|
590 |
-
|
591 |
-
|
592 |
-
textract_query_number += new_textract_queries
|
593 |
-
|
594 |
-
#if combined_out_message: out_message = combined_out_message
|
595 |
|
596 |
# Ensure no duplicated output files
|
597 |
log_files_output_paths = sorted(list(set(log_files_output_paths)))
|
@@ -601,7 +599,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
601 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
602 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
603 |
|
604 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path,
|
605 |
|
606 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
607 |
'''
|
@@ -1164,7 +1162,7 @@ def redact_image_pdf(file_path:str,
|
|
1164 |
page_max:int=999,
|
1165 |
text_extraction_method:str=tesseract_ocr_option,
|
1166 |
handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
|
1167 |
-
|
1168 |
current_loop_page:int=0,
|
1169 |
page_break_return:bool=False,
|
1170 |
annotations_all_pages:List=[],
|
@@ -1200,7 +1198,7 @@ def redact_image_pdf(file_path:str,
|
|
1200 |
- page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
|
1201 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
|
1202 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1203 |
-
-
|
1204 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
1205 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
1206 |
- all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
|
@@ -1350,7 +1348,7 @@ def redact_image_pdf(file_path:str,
|
|
1350 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
1351 |
pdf_page_as_bytes = image_buffer.getvalue()
|
1352 |
|
1353 |
-
text_blocks,
|
1354 |
|
1355 |
if textract_json_file_path not in log_files_output_paths:
|
1356 |
log_files_output_paths.append(textract_json_file_path)
|
@@ -1359,9 +1357,9 @@ def redact_image_pdf(file_path:str,
|
|
1359 |
except Exception as e:
|
1360 |
print("Textract extraction for page", reported_page_number, "failed due to:", e)
|
1361 |
textract_data = {"pages":[]}
|
1362 |
-
|
1363 |
|
1364 |
-
|
1365 |
|
1366 |
else:
|
1367 |
# Check if the current reported_page_number exists in the loaded JSON
|
@@ -1376,7 +1374,7 @@ def redact_image_pdf(file_path:str,
|
|
1376 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
1377 |
pdf_page_as_bytes = image_buffer.getvalue()
|
1378 |
|
1379 |
-
text_blocks,
|
1380 |
|
1381 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1382 |
if "pages" not in textract_data: textract_data["pages"] = []
|
@@ -1388,14 +1386,14 @@ def redact_image_pdf(file_path:str,
|
|
1388 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
1389 |
print(out_message)
|
1390 |
text_blocks = []
|
1391 |
-
|
1392 |
|
1393 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1394 |
if "pages" not in textract_data: textract_data["pages"] = []
|
1395 |
|
1396 |
raise Exception(out_message)
|
1397 |
|
1398 |
-
|
1399 |
|
1400 |
else:
|
1401 |
# If the page exists, retrieve the data
|
@@ -1563,7 +1561,7 @@ def redact_image_pdf(file_path:str,
|
|
1563 |
|
1564 |
current_loop_page += 1
|
1565 |
|
1566 |
-
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths,
|
1567 |
|
1568 |
# If it's an image file
|
1569 |
if is_pdf(file_path) == False:
|
@@ -1599,7 +1597,7 @@ def redact_image_pdf(file_path:str,
|
|
1599 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
1600 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
1601 |
|
1602 |
-
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths,
|
1603 |
|
1604 |
if text_extraction_method == textract_option:
|
1605 |
# Write the updated existing textract data back to the JSON file
|
@@ -1619,7 +1617,7 @@ def redact_image_pdf(file_path:str,
|
|
1619 |
|
1620 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
1621 |
|
1622 |
-
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths,
|
1623 |
|
1624 |
|
1625 |
###
|
|
|
99 |
duplication_file_path_outputs:list=[],
|
100 |
review_file_path:str="",
|
101 |
input_folder:str=INPUT_FOLDER,
|
102 |
+
total_textract_query_number:int=0,
|
103 |
ocr_file_path:str="",
|
104 |
prepare_images:bool=True,
|
105 |
progress=gr.Progress(track_tqdm=True)):
|
|
|
148 |
- duplication_file_outputs (list, optional): List to allow for export to the duplication function page.
|
149 |
- review_file_path (str, optional): The latest review file path created by the app
|
150 |
- input_folder (str, optional): The custom input path, if provided
|
151 |
+
- total_textract_query_number (int, optional): The number of textract queries up until this point.
|
152 |
- ocr_file_path (str, optional): The latest ocr file path created by the app
|
153 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
154 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
|
|
160 |
out_message = ""
|
161 |
pdf_file_name_with_ext = ""
|
162 |
pdf_file_name_without_ext = ""
|
163 |
+
blank_request_metadata = []
|
164 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
165 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
166 |
|
|
|
229 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
230 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
231 |
|
232 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
|
233 |
|
234 |
#if first_loop_state == False:
|
235 |
# Prepare documents and images as required if they don't already exist
|
|
|
292 |
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
293 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
294 |
|
295 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
|
296 |
|
297 |
# Load/create allow list
|
298 |
# If string, assume file path
|
|
|
422 |
|
423 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
424 |
|
425 |
+
pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
426 |
pdf_image_file_paths,
|
427 |
language,
|
428 |
chosen_redact_entities,
|
|
|
432 |
page_max,
|
433 |
text_extraction_method,
|
434 |
handwrite_signature_checkbox,
|
435 |
+
blank_request_metadata,
|
436 |
current_loop_page,
|
437 |
page_break_return,
|
438 |
annotations_all_pages,
|
|
|
453 |
output_folder=output_folder)
|
454 |
|
455 |
# Save Textract request metadata (if exists)
|
456 |
+
|
457 |
+
if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
|
458 |
+
all_textract_request_metadata.extend(new_textract_request_metadata)
|
459 |
+
|
460 |
|
461 |
elif text_extraction_method == text_ocr_option:
|
462 |
|
|
|
544 |
|
545 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
546 |
|
|
|
|
|
547 |
# Save the gradio_annotation_boxes to a review csv file
|
548 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
549 |
|
|
|
576 |
estimated_time_taken_state += time_taken
|
577 |
|
578 |
# If Textract requests were made, write them to the logging file. Also record the number of Textract requests
|
579 |
+
if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
|
580 |
all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
|
581 |
|
582 |
all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
|
|
|
588 |
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
589 |
log_files_output_paths.append(all_textract_request_metadata_file_path)
|
590 |
|
591 |
+
new_textract_query_numbers = len(all_textract_request_metadata)
|
592 |
+
total_textract_query_number += new_textract_query_numbers
|
|
|
|
|
|
|
593 |
|
594 |
# Ensure no duplicated output files
|
595 |
log_files_output_paths = sorted(list(set(log_files_output_paths)))
|
|
|
599 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
600 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
601 |
|
602 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
|
603 |
|
604 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
605 |
'''
|
|
|
1162 |
page_max:int=999,
|
1163 |
text_extraction_method:str=tesseract_ocr_option,
|
1164 |
handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
|
1165 |
+
textract_request_metadata:list=[],
|
1166 |
current_loop_page:int=0,
|
1167 |
page_break_return:bool=False,
|
1168 |
annotations_all_pages:List=[],
|
|
|
1198 |
- page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
|
1199 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
|
1200 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
1201 |
+
- textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty list.
|
1202 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
1203 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
1204 |
- all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
|
|
|
1348 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
1349 |
pdf_page_as_bytes = image_buffer.getvalue()
|
1350 |
|
1351 |
+
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1352 |
|
1353 |
if textract_json_file_path not in log_files_output_paths:
|
1354 |
log_files_output_paths.append(textract_json_file_path)
|
|
|
1357 |
except Exception as e:
|
1358 |
print("Textract extraction for page", reported_page_number, "failed due to:", e)
|
1359 |
textract_data = {"pages":[]}
|
1360 |
+
new_textract_request_metadata = "Failed Textract API call"
|
1361 |
|
1362 |
+
textract_request_metadata.append(new_textract_request_metadata)
|
1363 |
|
1364 |
else:
|
1365 |
# Check if the current reported_page_number exists in the loaded JSON
|
|
|
1374 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
1375 |
pdf_page_as_bytes = image_buffer.getvalue()
|
1376 |
|
1377 |
+
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1378 |
|
1379 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1380 |
if "pages" not in textract_data: textract_data["pages"] = []
|
|
|
1386 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
1387 |
print(out_message)
|
1388 |
text_blocks = []
|
1389 |
+
new_textract_request_metadata = "Failed Textract API call"
|
1390 |
|
1391 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
1392 |
if "pages" not in textract_data: textract_data["pages"] = []
|
1393 |
|
1394 |
raise Exception(out_message)
|
1395 |
|
1396 |
+
textract_request_metadata.append(new_textract_request_metadata)
|
1397 |
|
1398 |
else:
|
1399 |
# If the page exists, retrieve the data
|
|
|
1561 |
|
1562 |
current_loop_page += 1
|
1563 |
|
1564 |
+
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1565 |
|
1566 |
# If it's an image file
|
1567 |
if is_pdf(file_path) == False:
|
|
|
1597 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
1598 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
1599 |
|
1600 |
+
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1601 |
|
1602 |
if text_extraction_method == textract_option:
|
1603 |
# Write the updated existing textract data back to the JSON file
|
|
|
1617 |
|
1618 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
1619 |
|
1620 |
+
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1621 |
|
1622 |
|
1623 |
###
|
tools/helper_functions.py
CHANGED
@@ -31,7 +31,7 @@ def reset_state_vars():
|
|
31 |
show_share_button=False,
|
32 |
show_remove_button=False,
|
33 |
interactive=False
|
34 |
-
), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
|
35 |
|
36 |
def reset_ocr_results_state():
|
37 |
return pd.DataFrame(), pd.DataFrame(), []
|
|
|
31 |
show_share_button=False,
|
32 |
show_remove_button=False,
|
33 |
interactive=False
|
34 |
+
), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0
|
35 |
|
36 |
def reset_ocr_results_state():
|
37 |
return pd.DataFrame(), pd.DataFrame(), []
|
tools/textract_batch_call.py
CHANGED
@@ -22,6 +22,7 @@ def analyse_document_with_textract_api(
|
|
22 |
local_output_dir: str = OUTPUT_FOLDER,
|
23 |
analyse_signatures:List[str] = [],
|
24 |
successful_job_number:int=0,
|
|
|
25 |
general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
|
26 |
aws_region: str = AWS_REGION # Optional: specify region if not default
|
27 |
):
|
@@ -39,6 +40,7 @@ def analyse_document_with_textract_api(
|
|
39 |
local_output_dir (str, optional): Local directory to save the downloaded JSON results.
|
40 |
analyse_signatures (List[str], optional): Analyse signatures? Default is no.
|
41 |
successful_job_number (int): The number of successful jobs that have been submitted in this session.
|
|
|
42 |
aws_region (str, optional): AWS region name. Defaults to boto3 default region.
|
43 |
|
44 |
Returns:
|
@@ -189,8 +191,9 @@ def analyse_document_with_textract_api(
|
|
189 |
raise
|
190 |
|
191 |
successful_job_number += 1
|
|
|
192 |
|
193 |
-
return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
|
194 |
|
195 |
def return_job_status(job_id:str,
|
196 |
response:dict,
|
@@ -457,9 +460,9 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
|
|
457 |
|
458 |
try:
|
459 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
460 |
-
print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
|
461 |
s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
|
462 |
-
print("Download successful.")
|
463 |
except ClientError as e:
|
464 |
if e.response['Error']['Code'] == '404':
|
465 |
print("Log file does not exist in S3.")
|
@@ -527,4 +530,4 @@ def check_textract_outputs_exist(textract_output_found_checkbox):
|
|
527 |
if textract_output_found_checkbox == True:
|
528 |
print("Textract outputs found")
|
529 |
return
|
530 |
-
else: raise Exception("Relevant
|
|
|
22 |
local_output_dir: str = OUTPUT_FOLDER,
|
23 |
analyse_signatures:List[str] = [],
|
24 |
successful_job_number:int=0,
|
25 |
+
total_document_page_count:int=1,
|
26 |
general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
|
27 |
aws_region: str = AWS_REGION # Optional: specify region if not default
|
28 |
):
|
|
|
40 |
local_output_dir (str, optional): Local directory to save the downloaded JSON results.
|
41 |
analyse_signatures (List[str], optional): Analyse signatures? Default is no.
|
42 |
successful_job_number (int): The number of successful jobs that have been submitted in this session.
|
43 |
+
total_document_page_count (int): The number of pages in the document
|
44 |
aws_region (str, optional): AWS region name. Defaults to boto3 default region.
|
45 |
|
46 |
Returns:
|
|
|
191 |
raise
|
192 |
|
193 |
successful_job_number += 1
|
194 |
+
total_number_of_textract_page_calls = total_document_page_count
|
195 |
|
196 |
+
return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls
|
197 |
|
198 |
def return_job_status(job_id:str,
|
199 |
response:dict,
|
|
|
460 |
|
461 |
try:
|
462 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
463 |
+
#print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
|
464 |
s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
|
465 |
+
#print("Download successful.")
|
466 |
except ClientError as e:
|
467 |
if e.response['Error']['Code'] == '404':
|
468 |
print("Log file does not exist in S3.")
|
|
|
530 |
if textract_output_found_checkbox == True:
|
531 |
print("Textract outputs found")
|
532 |
return
|
533 |
+
else: raise Exception("Relevant Textract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")
|