Commit
·
8953ca0
1
Parent(s):
a56b9b0
Updated Textract logging
Browse files- app.py +9 -9
- load_s3_logs.py +20 -9
- tools/file_redaction.py +26 -28
- tools/helper_functions.py +1 -1
- tools/textract_batch_call.py +7 -4
app.py
CHANGED
|
@@ -201,7 +201,7 @@ with app:
|
|
| 201 |
job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
|
| 202 |
textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
|
| 203 |
selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
|
| 204 |
-
is_a_textract_api_call = gr.Checkbox(value=False, label="
|
| 205 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
| 206 |
|
| 207 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
|
@@ -498,7 +498,7 @@ with app:
|
|
| 498 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
| 499 |
|
| 500 |
# Run redaction function
|
| 501 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
|
| 502 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
| 503 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
| 504 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
|
|
@@ -520,7 +520,7 @@ with app:
|
|
| 520 |
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
| 521 |
|
| 522 |
# Send whole document to Textract for text extraction
|
| 523 |
-
send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
|
| 524 |
|
| 525 |
check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
|
| 526 |
success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
|
|
@@ -714,20 +714,20 @@ with app:
|
|
| 714 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
| 715 |
|
| 716 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
| 717 |
-
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
|
| 718 |
|
| 719 |
-
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
| 720 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 721 |
|
| 722 |
-
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
| 723 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 724 |
else:
|
| 725 |
-
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
|
| 726 |
|
| 727 |
-
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
| 728 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 729 |
|
| 730 |
-
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
|
| 731 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 732 |
|
| 733 |
if __name__ == "__main__":
|
|
|
|
| 201 |
job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
|
| 202 |
textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
|
| 203 |
selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
|
| 204 |
+
is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
|
| 205 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
| 206 |
|
| 207 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
|
|
|
| 498 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
| 499 |
|
| 500 |
# Run redaction function
|
| 501 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
|
| 502 |
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
|
| 503 |
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
| 504 |
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
|
|
|
|
| 520 |
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
|
| 521 |
|
| 522 |
# Send whole document to Textract for text extraction
|
| 523 |
+
send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number, total_pdf_page_count], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call, textract_query_number])
|
| 524 |
|
| 525 |
check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
|
| 526 |
success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
|
|
|
|
| 714 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
| 715 |
|
| 716 |
if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
|
| 717 |
+
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
| 718 |
|
| 719 |
+
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
| 720 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 721 |
|
| 722 |
+
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
| 723 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 724 |
else:
|
| 725 |
+
usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
|
| 726 |
|
| 727 |
+
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
| 728 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 729 |
|
| 730 |
+
successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], None, preprocess=False).\
|
| 731 |
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
| 732 |
|
| 733 |
if __name__ == "__main__":
|
load_s3_logs.py
CHANGED
|
@@ -2,14 +2,22 @@ import boto3
|
|
| 2 |
import pandas as pd
|
| 3 |
from io import StringIO
|
| 4 |
from datetime import datetime
|
| 5 |
-
from tools.config import DOCUMENT_REDACTION_BUCKET
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
-
# S3 setup
|
| 8 |
-
s3 = boto3.client('s3')
|
| 9 |
bucket_name = DOCUMENT_REDACTION_BUCKET
|
| 10 |
-
prefix = '
|
| 11 |
-
earliest_date = '
|
| 12 |
-
latest_date = '
|
| 13 |
|
| 14 |
# Function to list all files in a folder
|
| 15 |
def list_files_in_s3(bucket, prefix):
|
|
@@ -24,8 +32,8 @@ def is_within_date_range(date_str, start_date, end_date):
|
|
| 24 |
return start_date <= date_obj <= end_date
|
| 25 |
|
| 26 |
# Define the date range
|
| 27 |
-
start_date = datetime.strptime(
|
| 28 |
-
end_date = datetime.strptime(
|
| 29 |
|
| 30 |
# List all subfolders under 'usage/'
|
| 31 |
all_files = list_files_in_s3(bucket_name, prefix)
|
|
@@ -44,7 +52,10 @@ df_list = []
|
|
| 44 |
for log_file in log_files:
|
| 45 |
# Download the file
|
| 46 |
obj = s3.get_object(Bucket=bucket_name, Key=log_file)
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
# Read CSV content into pandas DataFrame
|
| 50 |
try:
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
from io import StringIO
|
| 4 |
from datetime import datetime
|
| 5 |
+
from tools.config import DOCUMENT_REDACTION_BUCKET, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
|
| 6 |
+
|
| 7 |
+
# Combine together log files that can be then used for e.g. dashboarding and financial tracking.
|
| 8 |
+
|
| 9 |
+
# S3 setup. Try to use provided keys (needs S3 permissions), otherwise assume AWS SSO connection
|
| 10 |
+
if AWS_ACCESS_KEY and AWS_SECRET_KEY and AWS_REGION:
|
| 11 |
+
s3 = boto3.client('s3',
|
| 12 |
+
aws_access_key_id=AWS_ACCESS_KEY,
|
| 13 |
+
aws_secret_access_key=AWS_SECRET_KEY,
|
| 14 |
+
region_name=AWS_REGION)
|
| 15 |
+
else: s3 = boto3.client('s3')
|
| 16 |
|
|
|
|
|
|
|
| 17 |
bucket_name = DOCUMENT_REDACTION_BUCKET
|
| 18 |
+
prefix = 'usage/' # 'feedback/' # 'logs/' # Change as needed - top-level folder where logs are stored
|
| 19 |
+
earliest_date = '20250409' # Earliest date of logs folder retrieved
|
| 20 |
+
latest_date = '20250423' # Latest date of logs folder retrieved
|
| 21 |
|
| 22 |
# Function to list all files in a folder
|
| 23 |
def list_files_in_s3(bucket, prefix):
|
|
|
|
| 32 |
return start_date <= date_obj <= end_date
|
| 33 |
|
| 34 |
# Define the date range
|
| 35 |
+
start_date = datetime.strptime(earliest_date, '%Y%m%d') # Replace with your start date
|
| 36 |
+
end_date = datetime.strptime(latest_date, '%Y%m%d') # Replace with your end date
|
| 37 |
|
| 38 |
# List all subfolders under 'usage/'
|
| 39 |
all_files = list_files_in_s3(bucket_name, prefix)
|
|
|
|
| 52 |
for log_file in log_files:
|
| 53 |
# Download the file
|
| 54 |
obj = s3.get_object(Bucket=bucket_name, Key=log_file)
|
| 55 |
+
try:
|
| 56 |
+
csv_content = obj['Body'].read().decode('utf-8')
|
| 57 |
+
except:
|
| 58 |
+
csv_content = obj['Body'].read().decode('latin-1')
|
| 59 |
|
| 60 |
# Read CSV content into pandas DataFrame
|
| 61 |
try:
|
tools/file_redaction.py
CHANGED
|
@@ -99,7 +99,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 99 |
duplication_file_path_outputs:list=[],
|
| 100 |
review_file_path:str="",
|
| 101 |
input_folder:str=INPUT_FOLDER,
|
| 102 |
-
|
| 103 |
ocr_file_path:str="",
|
| 104 |
prepare_images:bool=True,
|
| 105 |
progress=gr.Progress(track_tqdm=True)):
|
|
@@ -148,7 +148,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 148 |
- duplication_file_outputs (list, optional): List to allow for export to the duplication function page.
|
| 149 |
- review_file_path (str, optional): The latest review file path created by the app
|
| 150 |
- input_folder (str, optional): The custom input path, if provided
|
| 151 |
-
-
|
| 152 |
- ocr_file_path (str, optional): The latest ocr file path created by the app
|
| 153 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
| 154 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
|
@@ -160,7 +160,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 160 |
out_message = ""
|
| 161 |
pdf_file_name_with_ext = ""
|
| 162 |
pdf_file_name_without_ext = ""
|
| 163 |
-
|
| 164 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
| 165 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
| 166 |
|
|
@@ -229,7 +229,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 229 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
| 230 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
| 231 |
|
| 232 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path,
|
| 233 |
|
| 234 |
#if first_loop_state == False:
|
| 235 |
# Prepare documents and images as required if they don't already exist
|
|
@@ -292,7 +292,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 292 |
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
| 293 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
| 294 |
|
| 295 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path,
|
| 296 |
|
| 297 |
# Load/create allow list
|
| 298 |
# If string, assume file path
|
|
@@ -422,7 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 422 |
|
| 423 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
| 424 |
|
| 425 |
-
pymupdf_doc, all_pages_decision_process_table, out_file_paths,
|
| 426 |
pdf_image_file_paths,
|
| 427 |
language,
|
| 428 |
chosen_redact_entities,
|
|
@@ -432,7 +432,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 432 |
page_max,
|
| 433 |
text_extraction_method,
|
| 434 |
handwrite_signature_checkbox,
|
| 435 |
-
|
| 436 |
current_loop_page,
|
| 437 |
page_break_return,
|
| 438 |
annotations_all_pages,
|
|
@@ -453,7 +453,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 453 |
output_folder=output_folder)
|
| 454 |
|
| 455 |
# Save Textract request metadata (if exists)
|
| 456 |
-
|
|
|
|
|
|
|
|
|
|
| 457 |
|
| 458 |
elif text_extraction_method == text_ocr_option:
|
| 459 |
|
|
@@ -541,8 +544,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 541 |
|
| 542 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
| 543 |
|
| 544 |
-
|
| 545 |
-
|
| 546 |
# Save the gradio_annotation_boxes to a review csv file
|
| 547 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
| 548 |
|
|
@@ -575,7 +576,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 575 |
estimated_time_taken_state += time_taken
|
| 576 |
|
| 577 |
# If textract requests made, write to logging file. Also record number of Textract requests
|
| 578 |
-
if all_textract_request_metadata:
|
| 579 |
all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
|
| 580 |
|
| 581 |
all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
|
|
@@ -587,11 +588,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 587 |
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
| 588 |
log_files_output_paths.append(all_textract_request_metadata_file_path)
|
| 589 |
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
textract_query_number += new_textract_queries
|
| 593 |
-
|
| 594 |
-
#if combined_out_message: out_message = combined_out_message
|
| 595 |
|
| 596 |
# Ensure no duplicated output files
|
| 597 |
log_files_output_paths = sorted(list(set(log_files_output_paths)))
|
|
@@ -601,7 +599,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 601 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
| 602 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
| 603 |
|
| 604 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path,
|
| 605 |
|
| 606 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
| 607 |
'''
|
|
@@ -1164,7 +1162,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1164 |
page_max:int=999,
|
| 1165 |
text_extraction_method:str=tesseract_ocr_option,
|
| 1166 |
handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
|
| 1167 |
-
|
| 1168 |
current_loop_page:int=0,
|
| 1169 |
page_break_return:bool=False,
|
| 1170 |
annotations_all_pages:List=[],
|
|
@@ -1200,7 +1198,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1200 |
- page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
|
| 1201 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
|
| 1202 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
| 1203 |
-
-
|
| 1204 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
| 1205 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
| 1206 |
- all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
|
|
@@ -1350,7 +1348,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1350 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
| 1351 |
pdf_page_as_bytes = image_buffer.getvalue()
|
| 1352 |
|
| 1353 |
-
text_blocks,
|
| 1354 |
|
| 1355 |
if textract_json_file_path not in log_files_output_paths:
|
| 1356 |
log_files_output_paths.append(textract_json_file_path)
|
|
@@ -1359,9 +1357,9 @@ def redact_image_pdf(file_path:str,
|
|
| 1359 |
except Exception as e:
|
| 1360 |
print("Textract extraction for page", reported_page_number, "failed due to:", e)
|
| 1361 |
textract_data = {"pages":[]}
|
| 1362 |
-
|
| 1363 |
|
| 1364 |
-
|
| 1365 |
|
| 1366 |
else:
|
| 1367 |
# Check if the current reported_page_number exists in the loaded JSON
|
|
@@ -1376,7 +1374,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1376 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
| 1377 |
pdf_page_as_bytes = image_buffer.getvalue()
|
| 1378 |
|
| 1379 |
-
text_blocks,
|
| 1380 |
|
| 1381 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
| 1382 |
if "pages" not in textract_data: textract_data["pages"] = []
|
|
@@ -1388,14 +1386,14 @@ def redact_image_pdf(file_path:str,
|
|
| 1388 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
| 1389 |
print(out_message)
|
| 1390 |
text_blocks = []
|
| 1391 |
-
|
| 1392 |
|
| 1393 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
| 1394 |
if "pages" not in textract_data: textract_data["pages"] = []
|
| 1395 |
|
| 1396 |
raise Exception(out_message)
|
| 1397 |
|
| 1398 |
-
|
| 1399 |
|
| 1400 |
else:
|
| 1401 |
# If the page exists, retrieve the data
|
|
@@ -1563,7 +1561,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1563 |
|
| 1564 |
current_loop_page += 1
|
| 1565 |
|
| 1566 |
-
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths,
|
| 1567 |
|
| 1568 |
# If it's an image file
|
| 1569 |
if is_pdf(file_path) == False:
|
|
@@ -1599,7 +1597,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1599 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
| 1600 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
| 1601 |
|
| 1602 |
-
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths,
|
| 1603 |
|
| 1604 |
if text_extraction_method == textract_option:
|
| 1605 |
# Write the updated existing textract data back to the JSON file
|
|
@@ -1619,7 +1617,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1619 |
|
| 1620 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
| 1621 |
|
| 1622 |
-
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths,
|
| 1623 |
|
| 1624 |
|
| 1625 |
###
|
|
|
|
| 99 |
duplication_file_path_outputs:list=[],
|
| 100 |
review_file_path:str="",
|
| 101 |
input_folder:str=INPUT_FOLDER,
|
| 102 |
+
total_textract_query_number:int=0,
|
| 103 |
ocr_file_path:str="",
|
| 104 |
prepare_images:bool=True,
|
| 105 |
progress=gr.Progress(track_tqdm=True)):
|
|
|
|
| 148 |
- duplication_file_outputs (list, optional): List to allow for export to the duplication function page.
|
| 149 |
- review_file_path (str, optional): The latest review file path created by the app
|
| 150 |
- input_folder (str, optional): The custom input path, if provided
|
| 151 |
+
- total_textract_query_number (int, optional): The number of textract queries up until this point.
|
| 152 |
- ocr_file_path (str, optional): The latest ocr file path created by the app
|
| 153 |
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
|
| 154 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
|
|
|
| 160 |
out_message = ""
|
| 161 |
pdf_file_name_with_ext = ""
|
| 162 |
pdf_file_name_without_ext = ""
|
| 163 |
+
blank_request_metadata = []
|
| 164 |
all_textract_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
| 165 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
| 166 |
|
|
|
|
| 229 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
| 230 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
| 231 |
|
| 232 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
|
| 233 |
|
| 234 |
#if first_loop_state == False:
|
| 235 |
# Prepare documents and images as required if they don't already exist
|
|
|
|
| 292 |
#review_file_path = [x for x in out_file_paths if "review_file" in x]
|
| 293 |
if review_file_path: review_out_file_paths.append(review_file_path)
|
| 294 |
|
| 295 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
|
| 296 |
|
| 297 |
# Load/create allow list
|
| 298 |
# If string, assume file path
|
|
|
|
| 422 |
|
| 423 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
| 424 |
|
| 425 |
+
pymupdf_doc, all_pages_decision_process_table, out_file_paths, new_textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
| 426 |
pdf_image_file_paths,
|
| 427 |
language,
|
| 428 |
chosen_redact_entities,
|
|
|
|
| 432 |
page_max,
|
| 433 |
text_extraction_method,
|
| 434 |
handwrite_signature_checkbox,
|
| 435 |
+
blank_request_metadata,
|
| 436 |
current_loop_page,
|
| 437 |
page_break_return,
|
| 438 |
annotations_all_pages,
|
|
|
|
| 453 |
output_folder=output_folder)
|
| 454 |
|
| 455 |
# Save Textract request metadata (if exists)
|
| 456 |
+
|
| 457 |
+
if new_textract_request_metadata and isinstance(new_textract_request_metadata, list):
|
| 458 |
+
all_textract_request_metadata.extend(new_textract_request_metadata)
|
| 459 |
+
|
| 460 |
|
| 461 |
elif text_extraction_method == text_ocr_option:
|
| 462 |
|
|
|
|
| 544 |
|
| 545 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
| 546 |
|
|
|
|
|
|
|
| 547 |
# Save the gradio_annotation_boxes to a review csv file
|
| 548 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
| 549 |
|
|
|
|
| 576 |
estimated_time_taken_state += time_taken
|
| 577 |
|
| 578 |
# If textract requests made, write to logging file. Alos record number of Textract requests
|
| 579 |
+
if all_textract_request_metadata and isinstance(all_textract_request_metadata, list):
|
| 580 |
all_request_metadata_str = '\n'.join(all_textract_request_metadata).strip()
|
| 581 |
|
| 582 |
all_textract_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_metadata.txt"
|
|
|
|
| 588 |
if all_textract_request_metadata_file_path not in log_files_output_paths:
|
| 589 |
log_files_output_paths.append(all_textract_request_metadata_file_path)
|
| 590 |
|
| 591 |
+
new_textract_query_numbers = len(all_textract_request_metadata)
|
| 592 |
+
total_textract_query_number += new_textract_query_numbers
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
# Ensure no duplicated output files
|
| 595 |
log_files_output_paths = sorted(list(set(log_files_output_paths)))
|
|
|
|
| 599 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
| 600 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
| 601 |
|
| 602 |
+
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path
|
| 603 |
|
| 604 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page:Page, pikepdf_bbox, type="pikepdf_annot"):
|
| 605 |
'''
|
|
|
|
| 1162 |
page_max:int=999,
|
| 1163 |
text_extraction_method:str=tesseract_ocr_option,
|
| 1164 |
handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
|
| 1165 |
+
textract_request_metadata:list=[],
|
| 1166 |
current_loop_page:int=0,
|
| 1167 |
page_break_return:bool=False,
|
| 1168 |
annotations_all_pages:List=[],
|
|
|
|
| 1198 |
- page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
|
| 1199 |
- text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
|
| 1200 |
- handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
|
| 1201 |
+
- textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
|
| 1202 |
- page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
|
| 1203 |
- annotations_all_pages (List, optional): List of annotations on all pages that is used by the gradio_image_annotation object.
|
| 1204 |
- all_line_level_ocr_results_df (pd.DataFrame, optional): All line level OCR results for the document as a Pandas dataframe,
|
|
|
|
| 1348 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
| 1349 |
pdf_page_as_bytes = image_buffer.getvalue()
|
| 1350 |
|
| 1351 |
+
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
| 1352 |
|
| 1353 |
if textract_json_file_path not in log_files_output_paths:
|
| 1354 |
log_files_output_paths.append(textract_json_file_path)
|
|
|
|
| 1357 |
except Exception as e:
|
| 1358 |
print("Textract extraction for page", reported_page_number, "failed due to:", e)
|
| 1359 |
textract_data = {"pages":[]}
|
| 1360 |
+
new_textract_request_metadata = "Failed Textract API call"
|
| 1361 |
|
| 1362 |
+
textract_request_metadata.append(new_textract_request_metadata)
|
| 1363 |
|
| 1364 |
else:
|
| 1365 |
# Check if the current reported_page_number exists in the loaded JSON
|
|
|
|
| 1374 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
| 1375 |
pdf_page_as_bytes = image_buffer.getvalue()
|
| 1376 |
|
| 1377 |
+
text_blocks, new_textract_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
| 1378 |
|
| 1379 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
| 1380 |
if "pages" not in textract_data: textract_data["pages"] = []
|
|
|
|
| 1386 |
out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
|
| 1387 |
print(out_message)
|
| 1388 |
text_blocks = []
|
| 1389 |
+
new_textract_request_metadata = "Failed Textract API call"
|
| 1390 |
|
| 1391 |
# Check if "pages" key exists, if not, initialise it as an empty list
|
| 1392 |
if "pages" not in textract_data: textract_data["pages"] = []
|
| 1393 |
|
| 1394 |
raise Exception(out_message)
|
| 1395 |
|
| 1396 |
+
textract_request_metadata.append(new_textract_request_metadata)
|
| 1397 |
|
| 1398 |
else:
|
| 1399 |
# If the page exists, retrieve the data
|
|
|
|
| 1561 |
|
| 1562 |
current_loop_page += 1
|
| 1563 |
|
| 1564 |
+
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
| 1565 |
|
| 1566 |
# If it's an image file
|
| 1567 |
if is_pdf(file_path) == False:
|
|
|
|
| 1597 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
| 1598 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
| 1599 |
|
| 1600 |
+
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
| 1601 |
|
| 1602 |
if text_extraction_method == textract_option:
|
| 1603 |
# Write the updated existing textract data back to the JSON file
|
|
|
|
| 1617 |
|
| 1618 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
| 1619 |
|
| 1620 |
+
return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
| 1621 |
|
| 1622 |
|
| 1623 |
###
|
tools/helper_functions.py
CHANGED
|
@@ -31,7 +31,7 @@ def reset_state_vars():
|
|
| 31 |
show_share_button=False,
|
| 32 |
show_remove_button=False,
|
| 33 |
interactive=False
|
| 34 |
-
), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
|
| 35 |
|
| 36 |
def reset_ocr_results_state():
|
| 37 |
return pd.DataFrame(), pd.DataFrame(), []
|
|
|
|
| 31 |
show_share_button=False,
|
| 32 |
show_remove_button=False,
|
| 33 |
interactive=False
|
| 34 |
+
), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False, 0
|
| 35 |
|
| 36 |
def reset_ocr_results_state():
|
| 37 |
return pd.DataFrame(), pd.DataFrame(), []
|
tools/textract_batch_call.py
CHANGED
|
@@ -22,6 +22,7 @@ def analyse_document_with_textract_api(
|
|
| 22 |
local_output_dir: str = OUTPUT_FOLDER,
|
| 23 |
analyse_signatures:List[str] = [],
|
| 24 |
successful_job_number:int=0,
|
|
|
|
| 25 |
general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
|
| 26 |
aws_region: str = AWS_REGION # Optional: specify region if not default
|
| 27 |
):
|
|
@@ -39,6 +40,7 @@ def analyse_document_with_textract_api(
|
|
| 39 |
local_output_dir (str, optional): Local directory to save the downloaded JSON results.
|
| 40 |
analyse_signatures (List[str], optional): Analyse signatures? Default is no.
|
| 41 |
successful_job_number (int): The number of successful jobs that have been submitted in this session.
|
|
|
|
| 42 |
aws_region (str, optional): AWS region name. Defaults to boto3 default region.
|
| 43 |
|
| 44 |
Returns:
|
|
@@ -189,8 +191,9 @@ def analyse_document_with_textract_api(
|
|
| 189 |
raise
|
| 190 |
|
| 191 |
successful_job_number += 1
|
|
|
|
| 192 |
|
| 193 |
-
return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
|
| 194 |
|
| 195 |
def return_job_status(job_id:str,
|
| 196 |
response:dict,
|
|
@@ -457,9 +460,9 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
|
|
| 457 |
|
| 458 |
try:
|
| 459 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
| 460 |
-
print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
|
| 461 |
s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
|
| 462 |
-
print("Download successful.")
|
| 463 |
except ClientError as e:
|
| 464 |
if e.response['Error']['Code'] == '404':
|
| 465 |
print("Log file does not exist in S3.")
|
|
@@ -527,4 +530,4 @@ def check_textract_outputs_exist(textract_output_found_checkbox):
|
|
| 527 |
if textract_output_found_checkbox == True:
|
| 528 |
print("Textract outputs found")
|
| 529 |
return
|
| 530 |
-
else: raise Exception("Relevant
|
|
|
|
| 22 |
local_output_dir: str = OUTPUT_FOLDER,
|
| 23 |
analyse_signatures:List[str] = [],
|
| 24 |
successful_job_number:int=0,
|
| 25 |
+
total_document_page_count:int=1,
|
| 26 |
general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
|
| 27 |
aws_region: str = AWS_REGION # Optional: specify region if not default
|
| 28 |
):
|
|
|
|
| 40 |
local_output_dir (str, optional): Local directory to save the downloaded JSON results.
|
| 41 |
analyse_signatures (List[str], optional): Analyse signatures? Default is no.
|
| 42 |
successful_job_number (int): The number of successful jobs that have been submitted in this session.
|
| 43 |
+
total_document_page_count (int): The number of pages in the document
|
| 44 |
aws_region (str, optional): AWS region name. Defaults to boto3 default region.
|
| 45 |
|
| 46 |
Returns:
|
|
|
|
| 191 |
raise
|
| 192 |
|
| 193 |
successful_job_number += 1
|
| 194 |
+
total_number_of_textract_page_calls = total_document_page_count
|
| 195 |
|
| 196 |
+
return f"Textract analysis job submitted, job ID:{job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call, total_number_of_textract_page_calls
|
| 197 |
|
| 198 |
def return_job_status(job_id:str,
|
| 199 |
response:dict,
|
|
|
|
| 460 |
|
| 461 |
try:
|
| 462 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
| 463 |
+
#print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
|
| 464 |
s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
|
| 465 |
+
#print("Download successful.")
|
| 466 |
except ClientError as e:
|
| 467 |
if e.response['Error']['Code'] == '404':
|
| 468 |
print("Log file does not exist in S3.")
|
|
|
|
| 530 |
if textract_output_found_checkbox == True:
|
| 531 |
print("Textract outputs found")
|
| 532 |
return
|
| 533 |
+
else: raise Exception("Relevant Textract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")
|