Commit 4a5cee5 (unverified) · committed by Sean Pedrick-Case
Parents: 5203951 818efbc

Merge pull request #14 from seanpedrick-case/dev
Dockerfile CHANGED
@@ -1,5 +1,5 @@
# Stage 1: Build dependencies and download models
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm AS builder
+ FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm AS builder

# Install system dependencies. Need to specify -y for poppler to get it to install
RUN apt-get update \
@@ -27,7 +27,7 @@ COPY lambda_entrypoint.py .
COPY entrypoint.sh .

# Stage 2: Final runtime image
- FROM public.ecr.aws/docker/library/python:3.11.9-slim-bookworm
+ FROM public.ecr.aws/docker/library/python:3.11.11-slim-bookworm

# Define a build argument with a default value
ARG APP_MODE=gradio
@@ -52,11 +52,7 @@ RUN apt-get update \
RUN useradd -m -u 1000 user

# Create required directories
- RUN mkdir -p /home/user/app/output \
- && mkdir -p /home/user/app/input \
- && mkdir -p /home/user/app/tld \
- && mkdir -p /home/user/app/logs \
- && mkdir -p /home/user/app/config \
+ RUN mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} \
&& chown -R user:user /home/user/app

# Copy installed packages from builder stage
@@ -73,10 +69,11 @@ RUN chmod +x /entrypoint.sh
# Switch to the "user" user
USER user

+ ENV APP_HOME=/home/user
+
# Set environmental variables
- ENV HOME=/home/user \
- PATH=/home/user/.local/bin:$PATH \
- PYTHONPATH=/home/user/app \
+ ENV PATH=$APP_HOME/.local/bin:$PATH \
+ PYTHONPATH=$APP_HOME/app \
PYTHONUNBUFFERED=1 \
PYTHONDONTWRITEBYTECODE=1 \
GRADIO_ALLOW_FLAGGING=never \
@@ -84,15 +81,17 @@ ENV HOME=/home/user \
GRADIO_SERVER_NAME=0.0.0.0 \
GRADIO_SERVER_PORT=7860 \
GRADIO_ANALYTICS_ENABLED=False \
- GRADIO_THEME=huggingface \
- TLDEXTRACT_CACHE=$HOME/app/tld/.tld_set_snapshot \
+ TLDEXTRACT_CACHE=$APP_HOME/app/tld/.tld_set_snapshot \
SYSTEM=spaces

# Set the working directory to the user's home directory
- WORKDIR $HOME/app
+ WORKDIR $APP_HOME/app

# Copy the app code to the container
- COPY --chown=user . $HOME/app
+ COPY --chown=user . $APP_HOME/app
+
+ # Ensure permissions are really user:user again after copying
+ RUN chown -R user:user $APP_HOME/app && chmod -R u+rwX $APP_HOME/app

ENTRYPOINT [ "/entrypoint.sh" ]
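One caveat worth flagging on the consolidated mkdir line above: Docker executes RUN commands with /bin/sh -c, and on Debian-based slim images /bin/sh is dash, which does not perform brace expansion. Unless the shell is switched (e.g. SHELL ["/bin/bash", "-c"]), mkdir -p /home/user/app/{output,input,tld,logs,usage,feedback,config} creates a single literal directory named {output,input,tld,logs,usage,feedback,config} rather than seven directories. A minimal sketch of a POSIX-safe equivalent (the paths come from the diff; the rewrite itself is a suggestion, not part of this commit):

# Create required directories without relying on bash brace expansion
RUN mkdir -p /home/user/app/output /home/user/app/input /home/user/app/tld \
    /home/user/app/logs /home/user/app/usage /home/user/app/feedback /home/user/app/config \
 && chown -R user:user /home/user/app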
app.py CHANGED
@@ -1,28 +1,25 @@
import os
+ import logging
import pandas as pd
import gradio as gr
from gradio_image_annotation import image_annotator

- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
from tools.aws_functions import upload_file_to_s3, download_file_from_s3
from tools.file_redaction import choose_and_run_redactor
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
- from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
from tools.data_anonymise import anonymise_data_files
from tools.auth import authenticate_user
from tools.load_spacy_model_custom_recognisers import custom_entities
from tools.custom_csvlogger import CSVLogger_custom
from tools.find_duplicate_pages import identify_similar_pages
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id

# Suppress downcasting warnings
pd.set_option('future.no_silent_downcasting', True)

- add_folder_to_path(TESSERACT_FOLDER)
- add_folder_to_path(POPPLER_FOLDER)
-
- ensure_output_folder_exists(OUTPUT_FOLDER)
-
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']

full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
@@ -58,14 +55,16 @@ with app:

# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
pdf_doc_state = gr.State([])
- all_image_annotations_state = gr.State([])
+ all_image_annotations_state = gr.State([])


all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)

session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
+ host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
+ session_output_folder_textbox = gr.Textbox(value = SESSION_OUTPUT_FOLDER, label="session_output_folder_textbox", visible=False)
output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
@@ -133,6 +132,7 @@ with app:

clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
+ prepare_for_review_bool_false = gr.Checkbox(label="prepare_for_review_bool_false", value=False, visible=False)
prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)

## Settings page variables
@@ -147,20 +147,31 @@ with app:
# S3 settings for default allow list load
s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
- default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=ALLOW_LIST_PATH, visible=False)
+ default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
+
+ s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
+ s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
+ s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
+ successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
+
+ load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
+ s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
+ local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)

s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
- default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=COST_CODES_PATH, visible=False)
+ default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
+ default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)

# Base tables that are not modified subsequent to load
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
+ all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)

# Duplicate page detection
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
- duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)

# Tracking variables for current page (not visible)
current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
@@ -168,7 +179,7 @@ with app:

# Placeholders for elements that may be made visible later below depending on environment variables
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
- cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=False)
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)

textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
@@ -177,6 +188,22 @@ with app:

only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)

+ # Textract API call placeholders in case option not selected in config
+
+ job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
+
+ job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
+ check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
+ job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
+ textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
+ selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
+ is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
+ job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
+
+ textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
+
###
# UI DESIGN
###
@@ -199,32 +226,21 @@ with app:
with gr.Accordion("Redact document", open = True):
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)

- text_extract_method_radio = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
-
- with gr.Row(equal_height=True):
- pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])

with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
- handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])

- if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
- with gr.Accordion("AWS Textract bulk document API call", open = False, visible=True):
- with gr.Row(equal_height=True):
- job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=True)
- send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=True)
- with gr.Row(equal_height=True):
- check_state_of_textract_api__call_btn = gr.Button("Check state of Textract job", variant="secondary", visible=True)
- job_current_status = gr.Textbox(value="", label="job_current_status", visible=True)
- with gr.Row(equal_height=True):
- textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=True)
+ with gr.Row(equal_height=True):
+ pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])

if SHOW_COSTS == "True":
with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
- with gr.Row(equal_height=True):
- textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
- total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
- estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost ($)", value=0.00, precision=2, visible=True)
- estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
+ with gr.Row(equal_height=True):
+ textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)

if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
with gr.Accordion("Apply cost code", open = True, visible=True):
@@ -232,19 +248,32 @@ with app:
cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
with gr.Column():
reset_cost_code_dataframe_button = gr.Button(value="Reset code code table filter")
- cost_code_choice_drop = gr.Dropdown(value="", label="Choose cost code for analysis", choices=[], allow_custom_value=True, visible=True)
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
+
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
+ with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
+ with gr.Row(equal_height=True):
+ gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
+ with gr.Row(equal_height=True):
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
+ with gr.Row(equal_height=False):
+ with gr.Column(scale=2):
+ textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
+ with gr.Column(scale=1):
+ job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
+ check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
+ with gr.Row():
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)

gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)

with gr.Row():
- output_summary = gr.Textbox(label="Output summary", scale=1)
+ redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)

- with gr.Row():
- convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
-
# Feedback elements are invisible until revealed by redaction action
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
@@ -263,21 +292,16 @@ with app:
annotate_zoom_in = gr.Button("Zoom in", visible=False)
annotate_zoom_out = gr.Button("Zoom out", visible=False)
with gr.Row():
- clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)

- with gr.Row(equal_height=True):
with gr.Column(scale=2):
with gr.Row(equal_height=True):
annotation_last_page_button = gr.Button("Previous page", scale = 4)
- annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
- annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
annotation_next_page_button = gr.Button("Next page", scale = 4)
- with gr.Column(scale=1):
- annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
-

- with gr.Row():
- with gr.Column(scale=2):
zoom_str = str(annotator_zoom_number) + '%'

annotator = image_annotator(
@@ -297,7 +321,15 @@ with app:
handles_cursor=True,
interactive=False
)
with gr.Column(scale=1):
update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
with gr.Accordion("Search suggested redactions", open=True):
with gr.Row(equal_height=True):
@@ -318,17 +350,7 @@ with app:

with gr.Accordion("Search all extracted text", open=True):
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
- reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
-
- with gr.Row():
- with gr.Column(scale=2):
- with gr.Row(equal_height=True):
- annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
- annotate_current_page_bottom = gr.Number(value=1, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
- annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
- annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
- with gr.Column(scale=1):
- blank_markdown_bot = gr.Markdown(value="", label="")

with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -342,8 +364,8 @@ with app:
with gr.Accordion("Identify duplicate pages to redact", open = True):
in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
with gr.Row():
- duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
- find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 5)

duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])

@@ -432,7 +454,9 @@ with app:
all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)

### UI INTERACTION ###

###
# PDF/IMAGE REDACTION
@@ -440,7 +464,7 @@ with app:
# Recalculate estimated costs based on changes to inputs
if SHOW_COSTS == 'True':
# Calculate costs
- total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
@@ -460,31 +484,42 @@ with app:
cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])

in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])

# Run redaction function
- document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, output_summary]).\
- success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop]).\
- success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

# If a file has been completed, the function will continue onto the next document
- latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])

# If the line level ocr results are changed by load in by user or by a new redaction task, replace the ocr results displayed in the table
all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])

###
# REVIEW PDF REDACTIONS
@@ -493,7 +528,7 @@ with app:
# Upload previous files for modifying redactions
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

# Page number controls
@@ -501,11 +536,11 @@ with app:
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])

- annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page])
- annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page])

- annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])
- annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page_bottom, annotate_current_page_bottom])

annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])

@@ -548,16 +583,16 @@ with app:

# Review OCR text buttom
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
- reset_all_ocr_results_btn.click(reset_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])

# Convert review file to xfdf Adobe format
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])

# Convert xfdf Adobe file back to review_file.csv
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
- success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)

###
@@ -601,11 +636,20 @@ with app:
###

# Get connection details on app load
- app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox])

# If relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting S3 path, need to local path to give a download location
- if GET_DEFAULT_ALLOW_LIST == "True" and ALLOW_LIST_PATH:
- if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH:
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
print("Successfully loaded allow list from S3")
@@ -615,20 +659,24 @@ with app:
else: print("Could not load in default allow list")

# If relevant environment variable is set, load in the default cost code file from S3 or locally
- if GET_COST_CODES == "True" and COST_CODES_PATH:
- if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH:
app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
- success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
print("Successfully loaded cost codes from S3")
elif os.path.exists(COST_CODES_PATH):
print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
- app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
else: print("Could not load in cost code data")

# Log usernames and times of access to file (to know who is using the app when running on AWS)
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
- access_callback.setup([session_hash_textbox], ACCESS_LOGS_FOLDER)
- session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

# User submitted feedback for pdf redactions
@@ -647,16 +695,23 @@ with app:
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)

if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
- usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
else:
- usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], USAGE_LOGS_FOLDER)
- latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop], None, preprocess=False).\
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])

- if __name__ == "__main__":

if RUN_DIRECT_MODE == "0":

if os.environ['COGNITO_AUTH'] == "1":
@@ -667,7 +722,7 @@ if __name__ == "__main__":
else:
from tools.cli_redact import main

- main(first_loop_state, latest_file_completed=0, output_summary="", output_file_list=None,
log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
 
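The cost-estimation wiring in the diff registers the same calculate_aws_costs callback on every input that can affect the price. A minimal self-contained sketch of that Gradio pattern, with hypothetical simplified names and the per-1,000-page rates quoted in the UI text above (illustrative only, not the repo's actual function):

import gradio as gr

def estimate_cost(pages: float, signature_detection: bool) -> float:
    # Assumed illustrative rates: $3.50 per 1,000 pages with signature
    # detection, $1.50 without, as quoted in the radio button label
    rate = 3.50 if signature_detection else 1.50
    return round(pages * rate / 1000, 2)

with gr.Blocks() as demo:
    pages = gr.Number(label="Total page count", value=0)
    signatures = gr.Checkbox(label="Signature detection", value=True)
    cost = gr.Number(label="Estimated cost ($)", value=0.00, precision=2)

    # Any change to any input recomputes the single output, mirroring the
    # repeated .change(calculate_aws_costs, ...) registrations in app.py
    for component in (pages, signatures):
        component.change(estimate_cost, inputs=[pages, signatures], outputs=[cost])

demo.launch()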
 
1
  import os
2
+ import logging
3
  import pandas as pd
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
 
7
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_BULK_TEXTRACT_CALL_OPTIONS, TEXTRACT_BULK_ANALYSIS_BUCKET, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH
8
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select
9
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3
10
  from tools.file_redaction import choose_and_run_redactor
11
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, convert_review_df_to_annotation_json
12
+ from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api
13
  from tools.data_anonymise import anonymise_data_files
14
  from tools.auth import authenticate_user
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
22
 
 
 
 
 
 
23
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
24
 
25
  full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
 
55
 
56
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
57
  pdf_doc_state = gr.State([])
58
+ all_image_annotations_state = gr.State([])
59
 
60
 
61
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
62
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
63
 
64
  session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False)
65
+ host_name_textbox = gr.Textbox(label= "host_name_textbox", value=HOST_NAME, visible=False)
66
  s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False)
67
+ session_output_folder_textbox = gr.Textbox(value = SESSION_OUTPUT_FOLDER, label="session_output_folder_textbox", visible=False)
68
  output_folder_textbox = gr.Textbox(value = OUTPUT_FOLDER, label="output_folder_textbox", visible=False)
69
  input_folder_textbox = gr.Textbox(value = INPUT_FOLDER, label="input_folder_textbox", visible=False)
70
 
 
132
 
133
  clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False)
134
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
135
+ prepare_for_review_bool_false = gr.Checkbox(label="prepare_for_review_bool_false", value=False, visible=False)
136
  prepare_images_bool_false = gr.Checkbox(label="prepare_images_bool_false", value=False, visible=False)
137
 
138
  ## Settings page variables
 
147
  # S3 settings for default allow list load
148
  s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=DOCUMENT_REDACTION_BUCKET, visible=False)
149
  s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=S3_ALLOW_LIST_PATH, visible=False)
150
+ default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=OUTPUT_ALLOW_LIST_PATH, visible=False)
151
+
152
+ s3_bulk_textract_default_bucket = gr.Textbox(label = "Default Textract bulk S3 bucket", value=TEXTRACT_BULK_ANALYSIS_BUCKET, visible=False)
153
+ s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
154
+ s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
155
+ successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
156
+
157
+ load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
158
+ s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
159
+ local_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_JOBS_LOCAL_LOC, visible=False)
160
 
161
  s3_default_cost_codes_file = gr.Textbox(label = "Default cost centre file", value=S3_COST_CODES_PATH, visible=False)
162
+ default_cost_codes_output_folder_location = gr.Textbox(label = "Output default cost centre location", value=OUTPUT_COST_CODES_PATH, visible=False)
163
  enforce_cost_code_textbox = gr.Textbox(label = "Enforce cost code textbox", value=ENFORCE_COST_CODES, visible=False)
164
+ default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
165
 
166
  # Base tables that are not modified subsequent to load
167
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
168
  all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
169
+ all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
170
  cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
171
 
172
  # Duplicate page detection
173
  in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
174
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas", wrap=True)
175
 
176
  # Tracking variables for current page (not visible)
177
  current_loop_page_number = gr.Number(value=0,precision=0, interactive=False, label = "Last redacted page in document", visible=False)
 
179
 
180
  # Placeholders for elements that may be made visible later below depending on environment variables
181
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), type="pandas", visible=False, wrap=True)
182
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis. Please contact Finance if you can't find your cost code in the given list.", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=False)
183
 
184
  textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=False)
185
  total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=False)
 
188
 
189
  only_extract_text_radio = gr.Checkbox(value=False, label="Only extract text (no redaction)", visible=False)
190
 
191
+ # Textract API call placeholders in case option not selected in config
192
+
193
+ job_name_textbox = gr.Textbox(value="", label="Bulk Textract call", visible=False)
194
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract", variant="primary", visible=False)
195
+
196
+ job_id_textbox = gr.Textbox(label = "Latest job ID for bulk document analysis", value='', visible=False)
197
+ check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
198
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
199
+ job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
200
+ textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
201
+ selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
202
+ is_a_textract_api_call = gr.Checkbox(value=False, label="is_a_textract_api_call", visible=False)
203
+ job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
204
+
205
+ textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
206
+
207
  ###
208
  # UI DESIGN
209
  ###
 
226
  with gr.Accordion("Redact document", open = True):
227
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
228
 
229
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Go to Redaction settings - AWS Textract options to remove signature detection.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
 
 
 
230
 
231
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
232
+ handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
233
 
234
+ with gr.Row(equal_height=True):
235
+ pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
 
 
 
 
 
 
 
 
236
 
237
  if SHOW_COSTS == "True":
238
  with gr.Accordion("Estimated costs and time taken", open = True, visible=True):
239
+ with gr.Row(equal_height=True):
240
+ textract_output_found_checkbox = gr.Checkbox(value= False, label="Existing Textract output file found", interactive=False, visible=True)
241
+ total_pdf_page_count = gr.Number(label = "Total page count", value=0, visible=True)
242
+ estimated_aws_costs_number = gr.Number(label = "Approximate AWS Textract and/or Comprehend cost (£)", value=0.00, precision=2, visible=True)
243
+ estimated_time_taken_number = gr.Number(label = "Approximate time taken to extract text/redact (minutes)", value=0, visible=True, precision=2)
244
 
245
  if GET_COST_CODES == "True" or ENFORCE_COST_CODES == "True":
246
  with gr.Accordion("Apply cost code", open = True, visible=True):
 
248
  cost_code_dataframe = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Existing cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', visible=True, wrap=True, max_height=200)
249
  with gr.Column():
250
reset_cost_code_dataframe_button = gr.Button(value="Reset cost code table filter")
251
+ cost_code_choice_drop = gr.Dropdown(value=DEFAULT_COST_CODE, label="Choose cost code for analysis", choices=[DEFAULT_COST_CODE], allow_custom_value=False, visible=True)
252
+
253
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
254
+ with gr.Accordion("Submit whole document to AWS Textract API (quicker, max 3,000 pages per document)", open = False, visible=True):
255
+ with gr.Row(equal_height=True):
256
+ gr.Markdown("""Document will be submitted to AWS Textract API service to extract all text in the document. Processing will take place on (secure) AWS servers, and outputs will be stored on S3 for up to 7 days. To download the results, click 'Check status' below and they will be downloaded if ready.""")
257
+ with gr.Row(equal_height=True):
258
+ send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
259
+ with gr.Row(equal_height=False):
260
+ with gr.Column(scale=2):
261
+ textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(6,'fixed'), static_columns=[0,1,2,3,4,5])
262
+ with gr.Column(scale=1):
263
+ job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
264
+ check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
265
+ with gr.Row():
266
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
267
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
268
 
269
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
270
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
271
 
272
  with gr.Row():
273
+ redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
274
  output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
275
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
276
 
 
 
 
277
  # Feedback elements are invisible until revealed by redaction action
278
  pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
279
  pdf_feedback_radio = gr.Radio(label = "Quality of results", choices=["The results were good", "The results were not good"], visible=False)
 
292
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
293
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
294
  with gr.Row():
295
+ clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
296
 
297
+ with gr.Row():
298
  with gr.Column(scale=2):
299
  with gr.Row(equal_height=True):
300
  annotation_last_page_button = gr.Button("Previous page", scale = 4)
301
+ annotate_current_page = gr.Number(value=0, label="Current page", precision=0, scale = 2, min_width=50)
302
+ annotate_max_pages = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
303
  annotation_next_page_button = gr.Button("Next page", scale = 4)
 
 
 
304
 
 
 
305
  zoom_str = str(annotator_zoom_number) + '%'
306
 
307
  annotator = image_annotator(
 
321
  handles_cursor=True,
322
  interactive=False
323
  )
324
+
325
+ with gr.Row(equal_height=True):
326
+ annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
327
+ annotate_current_page_bottom = gr.Number(value=0, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
328
+ annotate_max_pages_bottom = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
329
+ annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
330
+
331
  with gr.Column(scale=1):
332
+ annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="primary")
333
  update_current_page_redactions_btn = gr.Button(value="Save changes on current page to file", variant="primary")
334
  with gr.Accordion("Search suggested redactions", open=True):
335
  with gr.Row(equal_height=True):
 
350
 
351
  with gr.Accordion("Search all extracted text", open=True):
352
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
353
+ reset_all_ocr_results_btn = gr.Button(value="Reset OCR output table filter")
 
 
 
 
 
 
 
 
 
 
354
 
355
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
356
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
 
364
  with gr.Accordion("Identify duplicate pages to redact", open = True):
365
  in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
366
  with gr.Row():
367
+ duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale=1)
368
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
369
 
370
  duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
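For orientation, the duplicate-page handler itself is not part of this diff. A minimal sketch of how pages from the uploaded 'ocr_output.csv' files could be compared against the similarity threshold above, using the scikit-learn already pinned in requirements.txt (the function name and grouping logic are assumptions, not the app's actual implementation; the 'page'/'text' columns match the OCR results table defined earlier):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def find_duplicate_pages_sketch(ocr_df: pd.DataFrame, threshold: float = 0.9) -> pd.DataFrame:
    # Concatenate OCR lines per page, then compare pages pairwise by TF-IDF cosine similarity
    page_texts = ocr_df.groupby("page")["text"].apply(" ".join)
    tfidf = TfidfVectorizer().fit_transform(page_texts)
    sims = cosine_similarity(tfidf)
    pages = page_texts.index.tolist()
    matches = [(pages[i], pages[j], round(float(sims[i, j]), 3))
               for i in range(len(pages)) for j in range(i + 1, len(pages))
               if sims[i, j] >= threshold]
    return pd.DataFrame(matches, columns=["page_a", "page_b", "similarity"])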
371
 
 
454
  all_output_files_btn = gr.Button("Click here to view all output files", variant="secondary")
455
  all_output_files = gr.File(label="All files in output folder", file_count='multiple', file_types=['.csv'], interactive=False)
456
 
457
+ ###
458
  ### UI INTERACTION ###
459
+ ###
460
 
461
  ###
462
  # PDF/IMAGE REDACTION
 
464
  # Recalculate estimated costs based on changes to inputs
465
  if SHOW_COSTS == 'True':
466
  # Calculate costs
467
+ total_pdf_page_count.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
468
  text_extract_method_radio.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
469
  pii_identification_method_drop.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
470
  handwrite_signature_checkbox.change(calculate_aws_costs, inputs=[total_pdf_page_count, text_extract_method_radio, handwrite_signature_checkbox, pii_identification_method_drop, textract_output_found_checkbox, only_extract_text_radio], outputs=[estimated_aws_costs_number])
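As a reference point for the wiring above, `calculate_aws_costs` is defined elsewhere in the repo. A hedged sketch of the arithmetic implied by the rates quoted in the UI labels (£2.66 per 1,000 Textract pages with signature detection, £1.14 without, and roughly £0.0075 per 10,000 Comprehend characters); the default option strings and the characters-per-page figure are assumptions, not the app's actual values:

def calculate_aws_costs_sketch(page_count, text_extract_method, handwrite_signature_options,
                               pii_method, textract_output_found, only_extract_text,
                               textract_option="AWS Textract", aws_pii_option="AWS Comprehend",
                               chars_per_page=3000):
    cost = 0.0
    # Textract is only charged when no existing output file was found
    if text_extract_method == textract_option and not textract_output_found:
        per_page = 2.66 / 1000 if "Extract signatures" in handwrite_signature_options else 1.14 / 1000
        cost += page_count * per_page
    # Comprehend is skipped when the user only wants text extraction
    if pii_method == aws_pii_option and not only_extract_text:
        cost += (page_count * chars_per_page / 10000) * 0.0075
    return round(cost, 2)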
 
484
  cost_code_dataframe.select(df_select_callback_cost, inputs=[cost_code_dataframe], outputs=[cost_code_choice_drop])
485
  reset_cost_code_dataframe_button.click(reset_base_dataframe, inputs=[cost_code_dataframe_base], outputs=[cost_code_dataframe])
486
 
487
+ cost_code_choice_drop.select(update_cost_code_dataframe_from_dropdown_select, inputs=[cost_code_choice_drop, cost_code_dataframe_base], outputs=[cost_code_dataframe])
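A plausible shape for the dropdown-select helper wired here, shown purely for illustration (that the cost code sits in the first column is an assumption):

import pandas as pd

def update_cost_code_df_from_dropdown_sketch(choice: str, base_df: pd.DataFrame) -> pd.DataFrame:
    if not choice:
        return base_df
    # Keep rows whose first column (the cost code) matches the chosen value
    return base_df[base_df.iloc[:, 0].astype(str) == str(choice)]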
488
+
489
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
490
+ success(fn = prepare_image_or_pdf, inputs=[in_doc_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, first_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool_false, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base]).\
491
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
492
 
493
  # Run redaction function
494
+ document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
495
+ success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
496
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
497
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path], api_name="redact_doc").\
498
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
499
 
500
# If the app has completed a batch of pages, it will rerun the redaction process until all pages in the document are finished
501
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
502
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
503
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
504
 
505
# If a file has been completed, the function will continue on to the next document
506
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
507
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path]).\
508
  success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
509
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
510
  success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
511
 
512
# If the line-level OCR results change (loaded in by the user or produced by a new redaction task), refresh the OCR results displayed in the table
513
  all_line_level_ocr_results_df_base.change(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
514
+
515
+ # Send whole document to Textract for text extraction
516
+ send_document_to_textract_api_btn.click(analyse_document_with_textract_api, inputs=[prepared_pdf_state, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, handwrite_signature_checkbox, successful_textract_api_call_number], outputs=[job_output_textbox, job_id_textbox, job_type_dropdown, successful_textract_api_call_number, is_a_textract_api_call])
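`analyse_document_with_textract_api` is defined elsewhere in the repo; the asynchronous boto3 calls it presumably wraps look roughly like this sketch (the API names and parameters are real boto3 Textract calls, while the bucket/key handling and the preceding S3 upload step are assumptions):

import boto3

def start_bulk_textract_job_sketch(bucket: str, input_key: str, output_prefix: str,
                                   detect_signatures: bool = True) -> str:
    client = boto3.client("textract")
    if detect_signatures:
        # document_analysis job type, with the signatures feature enabled
        response = client.start_document_analysis(
            DocumentLocation={"S3Object": {"Bucket": bucket, "Name": input_key}},
            FeatureTypes=["SIGNATURES"],
            OutputConfig={"S3Bucket": bucket, "S3Prefix": output_prefix})
    else:
        # cheaper document_text_detection job type
        response = client.start_document_text_detection(
            DocumentLocation={"S3Object": {"Bucket": bucket, "Name": input_key}},
            OutputConfig={"S3Bucket": bucket, "S3Prefix": output_prefix})
    return response["JobId"]  # surfaced in job_id_textbox for later polling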
517
+
518
+ check_state_of_textract_api_call_btn.click(check_for_provided_job_id, inputs=[job_id_textbox]).\
519
+ success(poll_bulk_textract_analysis_progress_and_download, inputs=[job_id_textbox, job_type_dropdown, s3_bulk_textract_output_subfolder, doc_file_name_no_extension_textbox, textract_job_detail_df, s3_bulk_textract_default_bucket, output_folder_textbox, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs = [textract_job_output_file, job_current_status, textract_job_detail_df]).\
520
+ success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
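The polling side, sketched with the real get_document_* APIs; the actual helper also downloads the merged JSON from S3 into the output folder and updates the job table, which is omitted here:

import boto3

def poll_textract_job_sketch(job_id: str, job_type: str = "document_text_detection") -> dict:
    client = boto3.client("textract")
    getter = (client.get_document_analysis if job_type == "document_analysis"
              else client.get_document_text_detection)
    blocks, next_token = [], None
    while True:
        kwargs = {"JobId": job_id}
        if next_token:
            kwargs["NextToken"] = next_token
        response = getter(**kwargs)
        if response["JobStatus"] == "IN_PROGRESS":
            return {"JobStatus": "IN_PROGRESS", "Blocks": []}
        # SUCCEEDED (or FAILED/PARTIAL_SUCCESS): collect paginated blocks
        blocks.extend(response.get("Blocks", []))
        next_token = response.get("NextToken")
        if not next_token:
            return {"JobStatus": response["JobStatus"], "Blocks": blocks}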
521
+
522
+ textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
523
 
524
  ###
525
  # REVIEW PDF REDACTIONS
 
528
  # Upload previous files for modifying redactions
529
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
530
  success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
531
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_base], api_name="prepare_doc").\
532
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
533
 
534
  # Page number controls
 
536
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
537
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
538
 
539
+ annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
540
+ annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])
541
 
542
+ annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom])
543
+ annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom])
544
 
545
  annotate_current_page_bottom.submit(update_other_annotator_number_from_current, inputs=[annotate_current_page_bottom], outputs=[annotate_current_page])
546
 
 
583
 
584
# Review OCR text button
585
  all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
586
+ reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])
587
 
588
  # Convert review file to xfdf Adobe format
589
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
590
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
591
  success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
592
 
593
  # Convert xfdf Adobe file back to review_file.csv
594
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list, total_pdf_page_count]).\
595
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, text_extract_method_radio, latest_file_completed_text, redaction_output_summary_textbox, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox, input_folder_textbox, prepare_images_bool_false], outputs=[redaction_output_summary_textbox, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes, textract_output_found_checkbox, all_img_details_state, all_line_level_ocr_results_df_placeholder]).\
596
  success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
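For the XFDF round-trip, a loose sketch of parsing Adobe comment annotations back into rows (the XFDF namespace and the rect/page attributes follow Adobe's spec; the output column names are assumptions, not the app's review-file schema):

import xml.etree.ElementTree as ET
import pandas as pd

def xfdf_to_dataframe_sketch(xfdf_path: str) -> pd.DataFrame:
    ns = {"xfdf": "http://ns.adobe.com/xfdf/"}
    root = ET.parse(xfdf_path).getroot()
    rows = []
    for annot in root.findall(".//xfdf:annots/*", ns):
        # rect is "x1,y1,x2,y2"; page is zero-based in XFDF
        rect = [float(v) for v in annot.get("rect", "0,0,0,0").split(",")]
        rows.append({"page": int(annot.get("page", 0)) + 1,
                     "xmin": rect[0], "ymin": rect[1],
                     "xmax": rect[2], "ymax": rect[3],
                     "label": annot.get("subject", "Redaction")})
    return pd.DataFrame(rows)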
597
 
598
  ###
 
636
  ###
637
 
638
  # Get connection details on app load
639
+
640
+ if SHOW_BULK_TEXTRACT_CALL_OPTIONS == "True":
641
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder]).\
642
+ success(load_in_textract_job_details, inputs=[load_s3_bulk_textract_logs_bool, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[textract_job_detail_df])
643
+ else:
644
+ app.load(get_connection_params, inputs=[output_folder_textbox, input_folder_textbox, session_output_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox, input_folder_textbox, s3_bulk_textract_input_subfolder, s3_bulk_textract_output_subfolder, s3_bulk_textract_logs_subfolder, local_bulk_textract_logs_subfolder])
645
+
646
+
647
+ # (Textract job details are loaded above when SHOW_BULK_TEXTRACT_CALL_OPTIONS is set to "True")
648
 
649
# If the relevant environment variable is set, load in the default allow list file from S3 or locally. Even when setting the S3 path, a local path is still needed to give a download location
650
+ if GET_DEFAULT_ALLOW_LIST == "True" and (ALLOW_LIST_PATH or S3_ALLOW_LIST_PATH):
651
+ if not os.path.exists(ALLOW_LIST_PATH) and S3_ALLOW_LIST_PATH and RUN_AWS_FUNCTIONS == "1":
652
+ print("Downloading allow list from S3")
653
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
654
  success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
655
  print("Successfully loaded allow list from S3")
 
659
  else: print("Could not load in default allow list")
660
 
661
  # If relevant environment variable is set, load in the default cost code file from S3 or locally
662
+ if GET_COST_CODES == "True" and (COST_CODES_PATH or S3_COST_CODES_PATH):
663
+ if not os.path.exists(COST_CODES_PATH) and S3_COST_CODES_PATH and RUN_AWS_FUNCTIONS == "1":
664
+ print("Downloading cost codes from S3")
665
  app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_cost_codes_file, default_cost_codes_output_folder_location]).\
666
+ success(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
667
  print("Successfully loaded cost codes from S3")
668
  elif os.path.exists(COST_CODES_PATH):
669
  print("Loading cost codes from default cost codes path location:", COST_CODES_PATH)
670
+ app.load(load_in_default_cost_codes, inputs = [default_cost_codes_output_folder_location, default_cost_code_textbox], outputs=[cost_code_dataframe, cost_code_dataframe_base, cost_code_choice_drop])
671
  else: print("Could not load in cost code data")
672
 
673
+ ### LOGGING
674
+
675
# Log usernames and times of access to a file (to know who is using the app when running on AWS)
676
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
677
+ access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
678
+
679
+ session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
680
  success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
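`CSVLogger_custom` is defined elsewhere in the repo; functionally, each flagged event appends a timestamped row to the dataset file, along these lines (a simplified stand-in for illustration, not the class itself):

import csv, os
from datetime import datetime

def log_access_row_sketch(dataset_path: str, session_hash: str, host_name: str) -> None:
    if os.path.dirname(dataset_path):
        os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
    new_file = not os.path.exists(dataset_path)
    with open(dataset_path, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if new_file:
            writer.writerow(["session_hash", "host_name", "timestamp"])  # header once
        writer.writerow([session_hash, host_name, datetime.now().isoformat()])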
681
 
682
  # User submitted feedback for pdf redactions
 
695
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
696
 
697
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
698
+ usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
699
+
700
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
701
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
702
+
703
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
704
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
705
  else:
706
+ usage_callback.setup([session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], USAGE_LOGS_FOLDER)
707
+
708
+ latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
709
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
710
 
711
+ successful_textract_api_call_number.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, blank_doc_file_name_no_extension_textbox_for_logs, data_full_file_name_textbox, actual_time_taken_number, total_pdf_page_count, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox], None, preprocess=False).\
712
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
713
 
714
+ if __name__ == "__main__":
715
  if RUN_DIRECT_MODE == "0":
716
 
717
  if os.environ['COGNITO_AUTH'] == "1":
 
722
  else:
723
  from tools.cli_redact import main
724
 
725
+ main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
726
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
727
current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(), chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
728
 
requirements.txt CHANGED
@@ -7,13 +7,13 @@ presidio_anonymizer==2.2.358
7
  presidio-image-redactor==0.0.56
8
  pikepdf==9.5.2
9
  pandas==2.2.3
10
- #nltk==3.9.1 # Not required
11
  scikit-learn==1.6.1
12
  spacy==3.8.4
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
#en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
- gradio==5.23.3
16
- boto3==1.37.17
 
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
 
7
  presidio-image-redactor==0.0.56
8
  pikepdf==9.5.2
9
  pandas==2.2.3
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
#en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
14
+ #gradio==5.23.3 # Using latest version of Gradio 5.25.0 below as it fixes the table select issues while filtered
15
+ https://gradio-pypi-previews.s3.amazonaws.com/3e66dcbc9f3b1d106f5488fb1dca51f0787e6d79/gradio-5.25.0-py3-none-any.whl
16
+ boto3==1.37.29
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
tools/aws_functions.py CHANGED
@@ -30,129 +30,101 @@ if RUN_AWS_FUNCTIONS == "1":
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
  print("Successfully assumed ARN role")
33
- print("Assumed Role ARN:", assumed_role_arn)
34
- print("Assumed Role Name:", assumed_role_name)
35
 
36
  except Exception as e:
37
  print("Could not get assumed role from STS:", e)
38
 
39
  # Download direct from S3 - requires login credentials
40
- def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str):
41
 
42
- s3 = boto3.client('s3', region_name=AWS_REGION)
43
- s3.download_file(bucket_name, key, local_file_path_and_name)
44
- print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
45
-
46
- def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str):
47
- """
48
- Download all files from an S3 folder to a local folder.
49
- """
50
- s3 = boto3.client('s3', region_name=AWS_REGION)
51
 
52
- # List objects in the specified S3 folder
53
- response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
54
-
55
- # Download each object
56
- for obj in response.get('Contents', []):
57
- # Extract object key and construct local file path
58
- object_key = obj['Key']
59
- local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
60
 
61
- # Create directories if necessary
62
- os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
63
 
64
- # Download the object
65
- try:
66
- s3.download_file(bucket_name, object_key, local_file_path)
67
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
68
  except Exception as e:
69
- print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
70
 
71
- def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str]):
 
72
  """
73
- Download specific files from an S3 folder to a local folder.
74
  """
75
- s3 = boto3.client('s3', region_name=AWS_REGION)
76
-
77
- print("Trying to download file: ", filenames)
78
-
79
- if filenames == '*':
80
- # List all objects in the S3 folder
81
- print("Trying to download all files in AWS folder: ", s3_folder)
82
- response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
83
-
84
- print("Found files in AWS folder: ", response.get('Contents', []))
85
-
86
- filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
87
 
88
- print("Found filenames in AWS folder: ", filenames)
89
 
90
- for filename in filenames:
91
- object_key = os.path.join(s3_folder, filename)
92
- local_file_path = os.path.join(local_folder, filename)
93
 
94
- # Create directories if necessary
95
- os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
 
 
 
96
 
97
- # Download the object
98
- try:
99
- s3.download_file(bucket_name, object_key, local_file_path)
100
- print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
101
- except Exception as e:
102
- print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
103
 
104
- def load_data_from_aws(in_aws_keyword_file, aws_password:str="", bucket_name:str=DOCUMENT_REDACTION_BUCKET):
 
 
 
 
 
 
105
 
106
- temp_dir = tempfile.mkdtemp()
107
- local_address_stub = temp_dir + '/doc-redaction/'
108
- files = []
109
-
110
- if not 'LAMBETH_BOROUGH_PLAN_PASSWORD' in os.environ:
111
- out_message = "Can't verify password for dataset access. Do you have a valid AWS connection? Data not loaded."
112
- return files, out_message
113
-
114
- if aws_password:
115
- if "Lambeth borough plan" in in_aws_keyword_file and aws_password == os.environ['LAMBETH_BOROUGH_PLAN_PASSWORD']:
116
-
117
- s3_folder_stub = 'example-data/lambeth-borough-plan/latest/'
118
 
119
- local_folder_path = local_address_stub
 
120
 
121
- # Check if folder exists
122
- if not os.path.exists(local_folder_path):
123
- print(f"Folder {local_folder_path} does not exist! Making folder.")
124
 
125
- os.mkdir(local_folder_path)
126
 
127
- # Check if folder is empty
128
- if len(os.listdir(local_folder_path)) == 0:
129
- print(f"Folder {local_folder_path} is empty")
130
- # Download data
131
- download_files_from_s3(bucket_name, s3_folder_stub, local_folder_path, filenames='*')
132
 
133
- print("AWS data downloaded")
134
 
135
- else:
136
- print(f"Folder {local_folder_path} is not empty")
137
 
138
- #files = os.listdir(local_folder_stub)
139
- #print(files)
140
 
141
- files = [os.path.join(local_folder_path, f) for f in os.listdir(local_folder_path) if os.path.isfile(os.path.join(local_folder_path, f))]
 
 
142
 
143
- out_message = "Data successfully loaded from AWS"
144
- print(out_message)
145
 
146
- else:
147
- out_message = "Data not loaded from AWS"
148
- print(out_message)
149
- else:
150
- out_message = "No password provided. Please ask the data team for access if you need this."
151
- print(out_message)
152
 
153
- return files, out_message
154
 
155
- def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET):
156
  """
157
  Uploads a file from local machine to Amazon S3.
158
 
@@ -165,33 +137,44 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCU
165
  - Message as variable/printed to console
166
  """
167
  final_out_message = []
 
168
 
169
- s3_client = boto3.client('s3', region_name=AWS_REGION)
 
 
170
 
171
- if isinstance(local_file_paths, str):
172
- local_file_paths = [local_file_paths]
173
 
174
- for file in local_file_paths:
175
- if s3_client:
176
- #print(s3_client)
177
- try:
178
- # Get file name off file path
179
- file_name = os.path.basename(file)
180
 
181
- s3_key_full = s3_key + file_name
182
- print("S3 key: ", s3_key_full)
 
 
 
 
183
 
184
- s3_client.upload_file(file, s3_bucket, s3_key_full)
185
- out_message = "File " + file_name + " uploaded successfully!"
186
- print(out_message)
187
-
188
- except Exception as e:
189
- out_message = f"Error uploading file(s): {e}"
190
- print(out_message)
191
 
192
- final_out_message.append(out_message)
193
- final_out_message_str = '\n'.join(final_out_message)
 
 
 
 
 
194
 
195
- else: final_out_message_str = "Could not connect to AWS."
 
 
 
 
 
 
 
 
 
196
 
197
  return final_out_message_str
 
30
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
31
 
32
  print("Successfully assumed ARN role")
33
+ #print("Assumed Role ARN:", assumed_role_arn)
34
+ #print("Assumed Role Name:", assumed_role_name)
35
 
36
  except Exception as e:
37
  print("Could not get assumed role from STS:", e)
38
 
39
  # Download direct from S3 - requires login credentials
40
+ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
41
 
42
+ if RUN_AWS_FUNCTIONS == "1":
 
 
 
 
 
 
 
 
43
 
44
+ try:
45
+ print("bucket_name:", bucket_name)
46
+ print("key:", key)
47
+ print("local_file_path_and_name:", local_file_path_and_name)
 
 
 
 
48
 
49
+ # Ensure the local directory exists
50
+ os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
51
 
52
+ s3 = boto3.client('s3', region_name=AWS_REGION)
53
+ s3.download_file(bucket_name, key, local_file_path_and_name)
54
+ print(f"File downloaded from s3://{bucket_name}/{key} to {local_file_path_and_name}")
 
55
  except Exception as e:
56
+ print("Could not download file:", key, "from s3 due to", e)
57
 
58
+
59
+ def download_folder_from_s3(bucket_name:str, s3_folder:str, local_folder:str, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
60
  """
61
+ Download all files from an S3 folder to a local folder.
62
  """
63
+ if RUN_AWS_FUNCTIONS == "1":
64
+ if bucket_name and s3_folder and local_folder:
 
 
 
 
 
 
 
 
 
 
65
 
66
+ s3 = boto3.client('s3', region_name=AWS_REGION)
67
 
68
+ # List objects in the specified S3 folder
69
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
 
70
 
71
+ # Download each object
72
+ for obj in response.get('Contents', []):
73
+ # Extract object key and construct local file path
74
+ object_key = obj['Key']
75
+ local_file_path = os.path.join(local_folder, os.path.relpath(object_key, s3_folder))
76
 
77
+ # Create directories if necessary
78
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
 
 
 
 
79
 
80
+ # Download the object
81
+ try:
82
+ s3.download_file(bucket_name, object_key, local_file_path)
83
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
84
+ except Exception as e:
85
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
86
+ else: print("One or more required variables are empty, could not download from S3")
87
 
88
+ def download_files_from_s3(bucket_name:str, s3_folder:str, local_folder:str, filenames:List[str], RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
89
+ """
90
+ Download specific files from an S3 folder to a local folder.
91
+ """
 
 
 
 
 
 
 
 
92
 
93
+ if RUN_AWS_FUNCTIONS == "1":
94
+ if bucket_name and s3_folder and local_folder and filenames:
95
 
96
+ s3 = boto3.client('s3', region_name=AWS_REGION)
 
 
97
 
98
+ print("Trying to download file: ", filenames)
99
 
100
+ if filenames == '*':
101
+ # List all objects in the S3 folder
102
+ print("Trying to download all files in AWS folder: ", s3_folder)
103
+ response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_folder)
 
104
 
105
+ print("Found files in AWS folder: ", response.get('Contents', []))
106
 
107
+ filenames = [obj['Key'].split('/')[-1] for obj in response.get('Contents', [])]
 
108
 
109
+ print("Found filenames in AWS folder: ", filenames)
 
110
 
111
+ for filename in filenames:
112
+ object_key = os.path.join(s3_folder, filename)
113
+ local_file_path = os.path.join(local_folder, filename)
114
 
115
+ # Create directories if necessary
116
+ os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
117
 
118
+ # Download the object
119
+ try:
120
+ s3.download_file(bucket_name, object_key, local_file_path)
121
+ print(f"Downloaded 's3://{bucket_name}/{object_key}' to '{local_file_path}'")
122
+ except Exception as e:
123
+ print(f"Error downloading 's3://{bucket_name}/{object_key}':", e)
124
 
125
+ else: print("One or more required variables are empty, could not download from S3")
126
 
127
+ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=DOCUMENT_REDACTION_BUCKET, RUN_AWS_FUNCTIONS:str = RUN_AWS_FUNCTIONS):
128
  """
129
  Uploads a file from local machine to Amazon S3.
130
 
 
137
  - Message as variable/printed to console
138
  """
139
  final_out_message = []
140
+ final_out_message_str = ""
141
 
142
+ if RUN_AWS_FUNCTIONS == "1":
143
+ try:
144
+ if s3_bucket and s3_key and local_file_paths:
145
 
146
+ s3_client = boto3.client('s3', region_name=AWS_REGION)
 
147
 
148
+ if isinstance(local_file_paths, str):
149
+ local_file_paths = [local_file_paths]
 
 
 
 
150
 
151
+ for file in local_file_paths:
152
+ if s3_client:
153
+ #print(s3_client)
154
+ try:
155
+ # Get file name off file path
156
+ file_name = os.path.basename(file)
157
 
158
+ s3_key_full = s3_key + file_name
159
+ print("S3 key: ", s3_key_full)
 
 
 
 
 
160
 
161
+ s3_client.upload_file(file, s3_bucket, s3_key_full)
162
+ out_message = "File " + file_name + " uploaded successfully!"
163
+ print(out_message)
164
+
165
+ except Exception as e:
166
+ out_message = f"Error uploading file(s): {e}"
167
+ print(out_message)
168
 
169
+ final_out_message.append(out_message)
170
+ final_out_message_str = '\n'.join(final_out_message)
171
+
172
+ else: final_out_message_str = "Could not connect to AWS."
173
+ else: final_out_message_str = "At least one essential variable is empty, could not upload to S3"
174
+ except Exception as e:
175
+ final_out_message_str = "Could not upload files to S3 due to: " + str(e)
176
+ print(final_out_message_str)
177
+ else:
178
+ final_out_message_str = "App not set to run AWS functions"
179
 
180
  return final_out_message_str
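Example of how the rewritten uploader is called by the logging hooks in app.py (the paths are illustrative only):

# upload_file_to_s3(["output/example_redacted.pdf"], "usage/20250410/host/")
# -> prints "S3 key:  usage/20250410/host/example_redacted.pdf" and returns
#    "File example_redacted.pdf uploaded successfully!" when RUN_AWS_FUNCTIONS == "1"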
tools/aws_textract.py CHANGED
@@ -6,6 +6,7 @@ import json
6
  from collections import defaultdict
7
  import pikepdf
8
  import time
 
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
  from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
11
 
@@ -38,12 +39,10 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
38
  else:
39
  client = boto3.client('textract', region_name=AWS_REGION)
40
  except:
41
- print("Cannot connect to AWS Textract")
 
 
42
  return [], "" # Return an empty list and an empty string
43
-
44
- #print("Analysing page with AWS Textract")
45
- #print("pdf_page_bytes:", pdf_page_bytes)
46
- #print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
47
 
48
  # Redact signatures if specified
49
  if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -137,6 +136,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
137
  # This is a new page
138
  elif "page_no" in page_json_data:
139
  text_blocks = page_json_data["data"]["Blocks"]
 
140
 
141
  is_signature = False
142
  is_handwriting = False
@@ -275,7 +275,7 @@ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_
275
 
276
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
277
 
278
- def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
279
  """
280
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
281
  """
@@ -307,7 +307,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
307
  print("Need to convert Textract JSON to app format.")
308
  try:
309
 
310
- textract_data = restructure_textract_output(textract_data)
311
  return textract_data, False, log_files_output_paths # Successfully converted
312
 
313
  except Exception as e:
@@ -318,7 +318,7 @@ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output
318
  print("textract data:", textract_data)
319
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
320
 
321
- def restructure_textract_output(textract_output: dict):
322
  """
323
  Reorganise Textract output from the bulk Textract analysis option on AWS
324
  into a format that works in this redaction app, reducing size.
@@ -328,10 +328,62 @@ def restructure_textract_output(textract_output: dict):
328
  # Extract total pages from DocumentMetadata
329
  document_metadata = textract_output.get("DocumentMetadata", {})
330
 
 
 
 
 
331
  for block in textract_output.get("Blocks", []):
332
  page_no = block.get("Page", 1) # Default to 1 if missing
333
 
334
- # Initialize page structure if not already present
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  if page_no not in pages_dict:
336
  pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
337
 
 
6
  from collections import defaultdict
7
  import pikepdf
8
  import time
9
+ import pandas as pd
10
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
11
  from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION
12
 
 
39
  else:
40
  client = boto3.client('textract', region_name=AWS_REGION)
41
  except:
42
+ out_message = "Cannot connect to AWS Textract"
43
+ print(out_message)
44
+ raise Exception(out_message)
45
  return [], "" # Return an empty list and an empty string
 
 
 
 
46
 
47
  # Redact signatures if specified
48
  if "Redact all identified signatures" in handwrite_signature_checkbox:
 
136
  # This is a new page
137
  elif "page_no" in page_json_data:
138
  text_blocks = page_json_data["data"]["Blocks"]
139
+ else: text_blocks = []
140
 
141
  is_signature = False
142
  is_handwriting = False
 
275
 
276
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
277
 
278
+ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str, page_sizes_df:pd.DataFrame):
279
  """
280
  Loads Textract JSON from a file, detects if conversion is needed, and converts if necessary.
281
  """
 
307
  print("Need to convert Textract JSON to app format.")
308
  try:
309
 
310
+ textract_data = restructure_textract_output(textract_data, page_sizes_df)
311
  return textract_data, False, log_files_output_paths # Successfully converted
312
 
313
  except Exception as e:
 
318
  print("textract data:", textract_data)
319
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
320
 
321
+ def restructure_textract_output(textract_output: dict, page_sizes_df:pd.DataFrame):
322
  """
323
  Reorganise Textract output from the bulk Textract analysis option on AWS
324
  into a format that works in this redaction app, reducing size.
 
328
  # Extract total pages from DocumentMetadata
329
  document_metadata = textract_output.get("DocumentMetadata", {})
330
 
331
+ # For efficient lookup, set 'page' as index if it's not already
332
+ if 'page' in page_sizes_df.columns:
333
+ page_sizes_df = page_sizes_df.set_index('page')
334
+
335
  for block in textract_output.get("Blocks", []):
336
  page_no = block.get("Page", 1) # Default to 1 if missing
337
 
338
+ # --- Geometry Conversion Logic ---
339
+ try:
340
+ page_info = page_sizes_df.loc[page_no]
341
+ cb_width = page_info['cropbox_width']
342
+ cb_height = page_info['cropbox_height']
343
+ mb_width = page_info['mediabox_width']
344
+ mb_height = page_info['mediabox_height']
345
+ cb_x_offset = page_info['cropbox_x_offset']
346
+ cb_y_offset_top = page_info['cropbox_y_offset_from_top']
347
+
348
+ # Check if conversion is needed (and avoid division by zero)
349
+ needs_conversion = (
350
+ abs(cb_width - mb_width) > 1e-6 or
351
+ abs(cb_height - mb_height) > 1e-6
352
+ ) and mb_width > 1e-6 and mb_height > 1e-6 # Avoid division by zero
353
+
354
+ if needs_conversion and 'Geometry' in block:
355
+ geometry = block['Geometry'] # Work directly on the block's geometry
356
+
357
+ # --- Convert BoundingBox ---
358
+ if 'BoundingBox' in geometry:
359
+ bbox = geometry['BoundingBox']
360
+ old_left = bbox['Left']
361
+ old_top = bbox['Top']
362
+ old_width = bbox['Width']
363
+ old_height = bbox['Height']
364
+
365
+ # Calculate absolute coordinates within CropBox
366
+ abs_cb_x = old_left * cb_width
367
+ abs_cb_y = old_top * cb_height
368
+ abs_cb_width = old_width * cb_width
369
+ abs_cb_height = old_height * cb_height
370
+
371
+ # Calculate absolute coordinates relative to MediaBox top-left
372
+ abs_mb_x = cb_x_offset + abs_cb_x
373
+ abs_mb_y = cb_y_offset_top + abs_cb_y
374
+
375
+ # Convert back to normalized coordinates relative to MediaBox
376
+ bbox['Left'] = abs_mb_x / mb_width
377
+ bbox['Top'] = abs_mb_y / mb_height
378
+ bbox['Width'] = abs_cb_width / mb_width
379
+ bbox['Height'] = abs_cb_height / mb_height
380
+ except KeyError:
381
+ print(f"Warning: Page number {page_no} not found in page_sizes_df. Skipping coordinate conversion for this block.")
382
+ # Decide how to handle missing page info: skip conversion, raise error, etc.
383
+ except ZeroDivisionError:
384
+ print(f"Warning: MediaBox width or height is zero for page {page_no}. Skipping coordinate conversion for this block.")
385
+
386
+ # Initialise page structure if not already present
387
  if page_no not in pages_dict:
388
  pages_dict[page_no] = {"page_no": str(page_no), "data": {"Blocks": []}}
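To sanity-check the conversion above, a worked example with illustrative page dimensions (not taken from a real document):

# MediaBox 612x792pt, CropBox 595x782pt offset by (8.5, 5) from the top-left.
# A Textract box at Left=0.0, Width=0.2 (normalised to the CropBox) becomes:
#   abs_cb_x = 0.0 * 595          = 0.0
#   abs_mb_x = 8.5 + 0.0          = 8.5
#   Left     = 8.5 / 612          ~ 0.0139  (nudged right of the page edge)
#   Width    = (0.2 * 595) / 612  ~ 0.1944  (slightly smaller share of the page)
# i.e. coordinates normalised to the CropBox are re-expressed against the MediaBox.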
389
 
tools/config.py CHANGED
@@ -1,12 +1,13 @@
1
  import os
2
  import tempfile
3
  import socket
 
4
  from datetime import datetime
5
  from dotenv import load_dotenv
6
  from tldextract import TLDExtract
7
 
8
  today_rev = datetime.now().strftime("%Y%m%d")
9
- host_name = socket.gethostname()
10
 
11
  # Set or retrieve configuration variables for the redaction app
12
 
@@ -27,29 +28,71 @@ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False)
27
 
28
  return value
29
 
 
 
30
 
31
- # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
32
- APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '')
 
 
 
 
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- if os.path.exists(APP_CONFIG_PATH):
36
- print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
37
- load_dotenv(APP_CONFIG_PATH)
 
 
 
 
 
 
 
 
 
38
 
39
  ###
40
  # AWS CONFIG
41
  ###
42
 
43
- # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
44
- AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '')
45
 
46
- if os.path.exists(AWS_CONFIG_PATH):
47
- print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
48
- load_dotenv(AWS_CONFIG_PATH)
 
 
49
 
50
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
51
 
52
- AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
53
 
54
  AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
55
 
@@ -65,14 +108,28 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
65
 
66
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # Custom headers e.g. if routing traffic through Cloudfront
69
  # Retrieving or setting CUSTOM_HEADER
70
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
71
- if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
72
 
73
  # Retrieving or setting CUSTOM_HEADER_VALUE
74
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
75
- if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
76
 
77
  ###
78
  # Images config
@@ -84,12 +141,14 @@ MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to No
84
  ###
85
  # File I/O config
86
  ###
87
-
88
- SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders
89
 
90
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
91
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
92
 
 
 
 
93
  # Allow for files to be saved in a temporary folder for increased security in some instances
94
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
95
  # Create a temporary directory
@@ -99,22 +158,39 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
99
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
100
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
101
 
102
- FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + today_rev + '/' + host_name + '/')
103
 
104
- USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'logs/' + today_rev + '/' + host_name + '/')
 
 
105
 
106
- ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'usage/' + today_rev + '/' + host_name + '/')
 
 
107
 
 
108
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
109
 
110
  ###
111
  # REDACTION CONFIG
112
- ###
113
- TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "tesseract/")
114
 
115
- POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "poppler/poppler-24.02.0/Library/bin/")
 
 
 
 
 
116
 
117
- SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
118
 
119
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
120
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
@@ -130,7 +206,10 @@ REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Current
130
  ###
131
 
132
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
133
- extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
 
 
 
134
 
135
  # Get some environment variables and Launch the Gradio app
136
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
@@ -153,15 +232,22 @@ ALLOW_LIST_PATH = get_or_create_env_var('ALLOW_LIST_PATH', '') # config/default_
153
 
154
  S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
155
 
156
- SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'True')
 
 
 
157
 
158
  GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
159
 
 
 
160
  COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
161
 
162
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
163
 
 
 
 
164
  ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
165
 
166
- if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
167
- if GET_COST_CODES == 'True': ENFORCE_COST_CODES = 'False'
 
1
  import os
2
  import tempfile
3
  import socket
4
+ import logging
5
  from datetime import datetime
6
  from dotenv import load_dotenv
7
  from tldextract import TLDExtract
8
 
9
  today_rev = datetime.now().strftime("%Y%m%d")
10
+ HOST_NAME = socket.gethostname()
11
 
12
  # Set or retrieve configuration variables for the redaction app
13
 
 
28
 
29
  return value
30
 
31
+ def ensure_folder_exists(output_folder:str):
32
+ """Checks if the specified folder exists, creates it if not."""
33
 
34
+ if not os.path.exists(output_folder):
35
+ # Create the folder if it doesn't exist
36
+ os.makedirs(output_folder, exist_ok=True)
37
+ print(f"Created the {output_folder} folder.")
38
+ else:
39
+ print(f"The {output_folder} folder already exists.")
40
 
41
+ def add_folder_to_path(folder_path: str):
42
+ '''
43
+ Check if a folder exists on your system. If so, get the absolute path and add it to the system PATH variable if it is not already there. This function is only relevant for locally-created executables built from this app (PyInstaller creates an _internal folder containing Tesseract and Poppler, which needs to be added to the system PATH for the app to run).
44
+ '''
45
+
46
+ if os.path.exists(folder_path) and os.path.isdir(folder_path):
47
+ print(folder_path, "folder exists.")
48
+
49
+ # Resolve relative path to absolute path
50
+ absolute_path = os.path.abspath(folder_path)
51
+
52
+ current_path = os.environ['PATH']
53
+ if absolute_path not in current_path.split(os.pathsep):
54
+ full_path_extension = absolute_path + os.pathsep + current_path
55
+ os.environ['PATH'] = full_path_extension
56
+ #print(f"Updated PATH with: ", full_path_extension)
57
+ else:
58
+ print(f"Directory {folder_path} already exists in PATH.")
59
+ else:
60
+ print(f"Folder not found at {folder_path} - not added to PATH")
61
+
62
+ ensure_folder_exists("config/")
63
+
64
+ # If you have an app_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
65
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', 'config/app_config.env') # e.g. config/app_config.env
66
 
67
+ if APP_CONFIG_PATH:
68
+ if os.path.exists(APP_CONFIG_PATH):
69
+ print(f"Loading app variables from config file {APP_CONFIG_PATH}")
70
+ load_dotenv(APP_CONFIG_PATH)
71
+ else: print("App config file not found at location:", APP_CONFIG_PATH)
72
+
73
+ # Report logging to console?
74
+ LOGGING = get_or_create_env_var('LOGGING', 'False')
75
+
76
+ if LOGGING == 'True':
77
+ # Configure logging
78
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
79
 
80
  ###
81
  # AWS CONFIG
82
  ###
83
 
84
+ # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'config/aws_config.env'
85
+ AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '') # e.g. config/aws_config.env
86
 
87
+ if AWS_CONFIG_PATH:
88
+ if os.path.exists(AWS_CONFIG_PATH):
89
+ print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
90
+ load_dotenv(AWS_CONFIG_PATH)
91
+ else: print("AWS config file not found at location:", AWS_CONFIG_PATH)
92
 
93
  RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
94
 
95
+ AWS_REGION = get_or_create_env_var('AWS_REGION', '')
96
 
97
  AWS_CLIENT_ID = get_or_create_env_var('AWS_CLIENT_ID', '')
98
 
 
108
 
109
  DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
110
 
111
+ SHOW_BULK_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_BULK_TEXTRACT_CALL_OPTIONS', 'False') # This feature is not currently implemented
112
+
113
+ TEXTRACT_BULK_ANALYSIS_BUCKET = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_BUCKET', '')
114
+
115
+ TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER', 'input')
116
+
117
+ TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER = get_or_create_env_var('TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER', 'output')
118
+
119
+ LOAD_PREVIOUS_TEXTRACT_JOBS_S3 = get_or_create_env_var('LOAD_PREVIOUS_TEXTRACT_JOBS_S3', 'False') # Whether or not to load previous Textract jobs from S3
120
+
121
+ TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored
122
+
123
+ TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
124
+
125
  # Custom headers e.g. if routing traffic through Cloudfront
126
  # Retrieving or setting CUSTOM_HEADER
127
  CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
128
+ #if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
129
 
130
  # Retrieving or setting CUSTOM_HEADER_VALUE
131
  CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
132
+ #if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
133
 
134
  ###
135
  # Images config
 
141
  ###
142
  # File I/O config
143
  ###
144
+ SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. should input and output files be saved in a subfolder, named after the session hash, within the output/input folders
 
145
 
146
  OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
147
  INPUT_FOLDER = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/') # 'input/'
148
 
149
+ ensure_folder_exists(OUTPUT_FOLDER)
150
+ ensure_folder_exists(INPUT_FOLDER)
151
+
152
  # Allow for files to be saved in a temporary folder for increased security in some instances
153
  if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
154
  # Create a temporary directory
 
158
  if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
159
  if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"
160
 
161
+ # By default, logs are put into a subfolder named after today's date and the host name of the instance running the app. This is to avoid, as far as possible, log files from one instance overwriting the logs of another instance on S3. If the app always runs on one system, or just locally, it is not necessary to make the log folders so specific.
162
+ # Another way to address this issue would be to write logs to another type of storage, e.g. a database such as DynamoDB. I may look into this in future.
163
+
164
+ USE_LOG_SUBFOLDERS = get_or_create_env_var('USE_LOG_SUBFOLDERS', 'True')
165
+
166
+ if USE_LOG_SUBFOLDERS == "True":
167
+ day_log_subfolder = today_rev + '/'
168
+ host_name_subfolder = HOST_NAME + '/'
169
+ full_log_subfolder = day_log_subfolder + host_name_subfolder
170
+ else:
171
+ full_log_subfolder = ""
172
 
173
+ FEEDBACK_LOGS_FOLDER = get_or_create_env_var('FEEDBACK_LOGS_FOLDER', 'feedback/' + full_log_subfolder)
174
+ ACCESS_LOGS_FOLDER = get_or_create_env_var('ACCESS_LOGS_FOLDER', 'logs/' + full_log_subfolder)
175
+ USAGE_LOGS_FOLDER = get_or_create_env_var('USAGE_LOGS_FOLDER', 'usage/' + full_log_subfolder)
176
 
177
+ ensure_folder_exists(FEEDBACK_LOGS_FOLDER)
178
+ ensure_folder_exists(ACCESS_LOGS_FOLDER)
179
+ ensure_folder_exists(USAGE_LOGS_FOLDER)
180
 
181
+ # Should the redacted file name be included in the logs? In some instances the names of the files themselves could be sensitive and should not be disclosed beyond the app, so by default this is False.
182
  DISPLAY_FILE_NAMES_IN_LOGS = get_or_create_env_var('DISPLAY_FILE_NAMES_IN_LOGS', 'False')
183
 
184
  ###
185
  # REDACTION CONFIG
 
 
186
 
187
+ # Set Tesseract and Poppler folder locations if you have installed them locally
188
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
189
+ POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
190
+
191
+ if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
192
+ if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
193
 
 
194
 
195
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
196
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
 
206
  ###
207
 
208
  TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
209
+ try:
210
+ extract = TLDExtract(cache_dir=TLDEXTRACT_CACHE)
211
+ except Exception: # Fall back to the default cache if the custom cache directory cannot be used
212
+ extract = TLDExtract(cache_dir=None)
213
 
214
  # Get some environment variables and Launch the Gradio app
215
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 
232
 
233
  S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_allow_list.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
234
 
235
+ if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
236
+ else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
237
+
238
+ SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
239
 
240
  GET_COST_CODES = get_or_create_env_var('GET_COST_CODES', 'False')
241
 
242
+ DEFAULT_COST_CODE = get_or_create_env_var('DEFAULT_COST_CODE', '')
243
+
244
  COST_CODES_PATH = get_or_create_env_var('COST_CODES_PATH', '') # 'config/COST_CENTRES.csv' # file should be a csv file with a single table in it that has two columns with a header. First column should contain cost codes, second column should contain a name or description for the cost code
245
 
246
  S3_COST_CODES_PATH = get_or_create_env_var('S3_COST_CODES_PATH', '') # COST_CENTRES.csv # This is a path within the DOCUMENT_REDACTION_BUCKET
247
 
248
+ if COST_CODES_PATH: OUTPUT_COST_CODES_PATH = COST_CODES_PATH
249
+ else: OUTPUT_COST_CODES_PATH = 'config/COST_CENTRES.csv'
250
+
251
  ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If you have cost codes listed, is it compulsory to choose one before redacting?
252
 
253
+ if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'
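
The pattern used throughout this file is: read an environment variable, fall back to a default, and optionally override both from a dotenv file. A minimal self-contained sketch of that pattern (the helper body is a plausible reading, not the repo's exact code, and EXAMPLE_SETTING is an illustrative name):

import os
from dotenv import load_dotenv  # python-dotenv

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Use the existing value if set, otherwise set and return the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# Values in config/app_config.env override the hard-coded defaults
if os.path.exists("config/app_config.env"):
    load_dotenv("config/app_config.env")

EXAMPLE_SETTING = get_or_create_env_var("EXAMPLE_SETTING", "False")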
 
tools/file_conversion.py CHANGED
@@ -181,7 +181,7 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
181
  widths = [result[2] for result in results]
182
  heights = [result[3] for result in results]
183
 
184
- print("PDF has been converted to images.")
185
  return images, widths, heights, results
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
@@ -208,7 +208,7 @@ def process_file_for_image_creation(file_path:str, prepare_for_review:bool=False
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
- print(f"{file_path} is a PDF file. Converting to image set")
212
 
213
  # Run your function for processing PDF files here
214
  img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
@@ -417,12 +417,29 @@ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float]
417
  pymupdf_page = pymupdf_doc.load_page(page_no)
418
  original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
419
 
420
- # Create a page_sizes_object.
421
- # If images have been created, then image width an height come from this value. Otherwise, they are set to the cropbox size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
422
  if image_sizes_width and image_sizes_height:
423
- out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
424
- else:
425
- out_page_image_sizes = {"page":reported_page_no, "image_path":image_file_paths[page_no], "image_width":pd.NA, "image_height":pd.NA, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height, "original_cropbox":original_cropboxes[-1]}
426
 
427
  page_sizes.append(out_page_image_sizes)
428
 
@@ -434,7 +451,7 @@ def prepare_image_or_pdf(
434
  latest_file_completed: int = 0,
435
  out_message: List[str] = [],
436
  first_loop_state: bool = False,
437
- number_of_pages:int = 1,
438
  all_annotations_object:List = [],
439
  prepare_for_review:bool = False,
440
  in_fully_redacted_list:List[int]=[],
@@ -481,6 +498,9 @@ def prepare_image_or_pdf(
481
  all_img_details = []
482
  review_file_csv = pd.DataFrame()
483
  all_line_level_ocr_results_df = pd.DataFrame()
 
 
 
484
 
485
  if isinstance(in_fully_redacted_list, pd.DataFrame):
486
  if not in_fully_redacted_list.empty:
@@ -494,7 +514,7 @@ def prepare_image_or_pdf(
494
  else:
495
  print("Now redacting file", str(latest_file_completed))
496
 
497
- # If out message or converted_file_paths are blank, change to a list so it can be appended to
498
  if isinstance(out_message, str): out_message = [out_message]
499
 
500
  if not file_paths: file_paths = []
@@ -521,15 +541,9 @@ def prepare_image_or_pdf(
521
  file_paths_list = [file_paths]
522
  file_paths_loop = file_paths_list
523
  else:
524
- if prepare_for_review == False:
525
- file_paths_list = file_paths
526
- file_paths_loop = [file_paths_list[int(latest_file_completed)]]
527
- else:
528
- file_paths_list = file_paths
529
- file_paths_loop = file_paths
530
- # Sort files to prioritise PDF files first, then JSON files. This means that the pdf can be loaded in, and pdf page path locations can be added to the json
531
- file_paths_loop = sorted(file_paths_loop, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
532
-
533
  # Loop through files to load in
534
  for file in file_paths_loop:
535
  converted_file_path = []
@@ -592,7 +606,6 @@ def prepare_image_or_pdf(
592
 
593
  image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
594
 
595
-
596
  # Create a page_sizes_object
597
  page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
598
 
@@ -612,7 +625,8 @@ def prepare_image_or_pdf(
612
  json_from_csv = False
613
 
614
  # NEW IF STATEMENT
615
- # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
 
616
  if (file_extension in ['.json']) | (json_from_csv == True):
617
 
618
  if (file_extension in ['.json']) & (prepare_for_review == True):
@@ -624,9 +638,14 @@ def prepare_image_or_pdf(
624
  all_annotations_object = json.loads(file_path) # Use loads for string content
625
 
626
  # Assume it's a textract json
627
- elif (file_extension == '.json') and (prepare_for_review is not True):
 
628
  # Copy it to the output folder so it can be used later.
629
- out_textract_path = os.path.join(output_folder, file_path_without_ext + "_textract.json")
 
 
 
 
630
 
631
  # Use shutil to copy the file directly
632
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
@@ -748,11 +767,11 @@ def prepare_image_or_pdf(
748
  print(out_time)
749
 
750
  out_message.append(out_time)
751
- out_message_out = '\n'.join(out_message)
752
 
753
- number_of_pages = len(image_file_paths)
754
 
755
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
756
 
757
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
758
  file_path_without_ext = get_file_name_without_type(in_file_path)
 
181
  widths = [result[2] for result in results]
182
  heights = [result[3] for result in results]
183
 
184
+ #print("PDF has been converted to images.")
185
  return images, widths, heights, results
186
 
187
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 
208
 
209
  # Check if the file is a PDF
210
  elif file_extension == '.pdf':
211
+ # print(f"{file_path} is a PDF file. Converting to image set")
212
 
213
  # Run your function for processing PDF files here
214
  img_path, image_sizes_width, image_sizes_height, all_img_details = convert_pdf_to_images(file_path, prepare_for_review, input_folder=input_folder, create_images=create_images)
 
417
  pymupdf_page = pymupdf_doc.load_page(page_no)
418
  original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
419
 
420
+ # Create a page_sizes object. If images have been created, then image width and height come from the image files. Otherwise, they are left unset.
421
+ out_page_image_sizes = {
422
+ "page":reported_page_no,
423
+ "mediabox_width":pymupdf_page.mediabox.width,
424
+ "mediabox_height": pymupdf_page.mediabox.height,
425
+ "cropbox_width":pymupdf_page.cropbox.width,
426
+ "cropbox_height":pymupdf_page.cropbox.height,
427
+ "original_cropbox":original_cropboxes[-1],
428
+ "image_path":image_file_paths[page_no]}
429
+
430
+ # cropbox_x_offset: Distance from MediaBox left edge to CropBox left edge
431
+ # This is simply the difference in their x0 coordinates.
432
+ out_page_image_sizes['cropbox_x_offset'] = pymupdf_page.cropbox.x0 - pymupdf_page.mediabox.x0
433
+
434
+ # cropbox_y_offset_from_top: Distance from MediaBox top edge to CropBox top edge
435
+ # MediaBox top y = mediabox.y1
436
+ # CropBox top y = cropbox.y1
437
+ # The difference is mediabox.y1 - cropbox.y1
438
+ out_page_image_sizes['cropbox_y_offset_from_top'] = pymupdf_page.mediabox.y1 - pymupdf_page.cropbox.y1
439
+
440
  if image_sizes_width and image_sizes_height:
441
+ out_page_image_sizes["image_width"] = image_sizes_width[page_no]
442
+ out_page_image_sizes["image_height"] = image_sizes_height[page_no]
 
443
 
444
  page_sizes.append(out_page_image_sizes)
445
 
 
451
  latest_file_completed: int = 0,
452
  out_message: List[str] = [],
453
  first_loop_state: bool = False,
454
+ number_of_pages:int = 0,
455
  all_annotations_object:List = [],
456
  prepare_for_review:bool = False,
457
  in_fully_redacted_list:List[int]=[],
 
498
  all_img_details = []
499
  review_file_csv = pd.DataFrame()
500
  all_line_level_ocr_results_df = pd.DataFrame()
501
+ out_textract_path = ""
502
+ combined_out_message = ""
503
+ final_out_message = ""
504
 
505
  if isinstance(in_fully_redacted_list, pd.DataFrame):
506
  if not in_fully_redacted_list.empty:
 
514
  else:
515
  print("Now redacting file", str(latest_file_completed))
516
 
517
+ # If combined out message or converted_file_paths are blank, change to a list so it can be appended to
518
  if isinstance(out_message, str): out_message = [out_message]
519
 
520
  if not file_paths: file_paths = []
 
541
  file_paths_list = [file_paths]
542
  file_paths_loop = file_paths_list
543
  else:
544
+ file_paths_list = file_paths
545
+ file_paths_loop = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json')) # Sort so PDF files load first, then JSON files, so that PDF page locations exist before JSON annotations are added
546
+
547
  # Loop through files to load in
548
  for file in file_paths_loop:
549
  converted_file_path = []
 
606
 
607
  image_file_paths, image_sizes_width, image_sizes_height, all_img_details = process_file_for_image_creation(file_path_str, prepare_for_review, input_folder, create_images=True)
608
 
 
609
  # Create a page_sizes_object
610
  page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height, image_file_paths)
611
 
 
625
  json_from_csv = False
626
 
627
  # NEW IF STATEMENT
628
+ # If the file name ends with .json, check if we are loading for review. If so, assume it is an annotations object and overwrite the current annotations object. If not, assume this is a Textract object and load it in as Textract output.
629
+
630
  if (file_extension in ['.json']) | (json_from_csv == True):
631
 
632
  if (file_extension in ['.json']) & (prepare_for_review == True):
 
638
  all_annotations_object = json.loads(file_path) # Use loads for string content
639
 
640
  # Assume it's a textract json
641
+ elif (file_extension in ['.json']) and (prepare_for_review != True):
642
+ print("Saving Textract output")
643
  # Copy it to the output folder so it can be used later.
644
+ # Ensure the output file name ends with "_textract.json"
645
+ if not file_path.endswith("_textract.json"): output_textract_json_file_name = file_path_without_ext + "_textract.json"
646
+ else: output_textract_json_file_name = file_path_without_ext + ".json"
647
+
648
+ out_textract_path = os.path.join(output_folder, output_textract_json_file_name)
649
 
650
  # Use shutil to copy the file directly
651
  shutil.copy2(file_path, out_textract_path) # Preserves metadata
 
767
  print(out_time)
768
 
769
  out_message.append(out_time)
770
+ combined_out_message = '\n'.join(out_message)
771
 
772
+ number_of_pages = len(page_sizes) # previously len(image_file_paths)
773
 
774
+ return combined_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes, textract_output_found, all_img_details, all_line_level_ocr_results_df
775
 
776
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi, output_folder:str=OUTPUT_FOLDER, input_folder:str=INPUT_FOLDER):
777
  file_path_without_ext = get_file_name_without_type(in_file_path)
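
The cropbox offsets recorded in create_page_size_objects above can be reproduced directly with PyMuPDF. A minimal sketch, assuming an example.pdf exists locally (the module is importable as fitz on older PyMuPDF releases):

import pymupdf  # PyMuPDF

doc = pymupdf.open("example.pdf")
page = doc.load_page(0)

# Distance from the MediaBox left edge to the CropBox left edge
cropbox_x_offset = page.cropbox.x0 - page.mediabox.x0
# PDF y-coordinates grow upwards, so the top edges are the y1 values
cropbox_y_offset_from_top = page.mediabox.y1 - page.cropbox.y1
print(cropbox_x_offset, cropbox_y_offset_from_top)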
tools/file_redaction.py CHANGED
@@ -205,7 +205,7 @@ def choose_and_run_redactor(file_paths:List[str],
205
  latest_file_completed = int(latest_file_completed)
206
 
207
  if isinstance(file_paths,str): number_of_files = 1
208
- else: number_of_files = len(file_paths)
209
 
210
  # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211
  if latest_file_completed >= number_of_files:
@@ -330,7 +330,7 @@ def choose_and_run_redactor(file_paths:List[str],
330
 
331
 
332
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
333
- if pii_identification_method == "AWS Comprehend":
334
  if aws_access_key_textbox and aws_secret_key_textbox:
335
  print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
336
  comprehend_client = boto3.client('comprehend',
@@ -349,7 +349,8 @@ def choose_and_run_redactor(file_paths:List[str],
349
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
350
  print(out_message)
351
  raise Exception(out_message)
352
- else: comprehend_client = ""
 
353
 
354
  # Try to connect to AWS Textract Client if using that text extraction method
355
  if text_extraction_method == textract_option:
@@ -365,13 +366,17 @@ def choose_and_run_redactor(file_paths:List[str],
365
  print("Getting Textract credentials from environment variables.")
366
  textract_client = boto3.client('textract',
367
  aws_access_key_id=AWS_ACCESS_KEY,
368
- aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
 
 
 
369
  else:
370
  textract_client = ""
371
- out_message_warning = "Cannot connect to AWS Textract service."
372
- print(out_message_warning)
373
- #raise Warning(out_message)
374
- else: textract_client = ""
 
375
 
376
  # Check if output_folder exists, create it if it doesn't
377
  if not os.path.exists(output_folder): os.makedirs(output_folder)
@@ -764,28 +769,66 @@ def move_page_info(file_path: str) -> str:
764
 
765
  return new_file_path
766
 
767
- def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image):
768
  '''
769
  Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
770
  '''
771
 
772
  img_annotation_box = {}
773
 
 
 
 
 
 
 
 
 
 
774
  if image:
775
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
 
776
  else:
777
- pymupdf_x1 = annot.left
778
- pymupdf_x2 = annot.left + annot.width
779
- pymupdf_y1 = annot.top
780
- pymupdf_y2 = annot.top + annot.height
781
-
782
- x1 = pymupdf_x1
783
- x2 = pymupdf_x2
784
-
785
- img_annotation_box["xmin"] = annot.left
786
- img_annotation_box["ymin"] = annot.top
787
- img_annotation_box["xmax"] = annot.left + annot.width
788
- img_annotation_box["ymax"] = annot.top + annot.height
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  img_annotation_box["color"] = (0,0,0)
790
  try:
791
  img_annotation_box["label"] = str(annot.entity_type)
@@ -795,12 +838,11 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
795
  if hasattr(annot, 'text') and annot.text:
796
  img_annotation_box["text"] = str(annot.text)
797
  else:
798
- img_annotation_box["text"] = ""
799
-
800
- rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
801
 
802
  return img_annotation_box, rect
803
 
 
804
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
805
  '''
806
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
@@ -951,8 +993,9 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
951
  rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
952
 
953
  # Else should be CustomImageRecognizerResult
954
- elif isinstance(annot, CustomImageRecognizerResult):
955
- img_annotation_box, rect = prepare_custom_image_recogniser_result_annotation_box(page, annot, image)
 
956
 
957
  # Else it should be a pikepdf annotation object
958
  else:
@@ -1170,8 +1213,7 @@ def redact_image_pdf(file_path:str,
1170
 
1171
  tic = time.perf_counter()
1172
 
1173
- file_name = get_file_name_without_type(file_path)
1174
-
1175
  comprehend_query_number_new = 0
1176
 
1177
  # Update custom word list analyser object with any new words that have been added to the custom deny list
@@ -1211,7 +1253,7 @@ def redact_image_pdf(file_path:str,
1211
  # If running Textract, check if file already exists. If it does, load in existing data
1212
  if text_extraction_method == textract_option:
1213
  textract_json_file_path = output_folder + file_name + "_textract.json"
1214
- textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths)
1215
  original_textract_data = textract_data.copy()
1216
 
1217
  ###
@@ -1285,6 +1327,8 @@ def redact_image_pdf(file_path:str,
1285
 
1286
  # Check if page exists in existing textract data. If not, send to service to analyse
1287
  if text_extraction_method == textract_option:
 
 
1288
  if not textract_data:
1289
  try:
1290
  # Convert the image_path to bytes using an in-memory buffer
@@ -1327,12 +1371,15 @@ def redact_image_pdf(file_path:str,
1327
  textract_data["pages"].append(text_blocks)
1328
 
1329
  except Exception as e:
1330
- print("Textract extraction for page", reported_page_number, "failed due to:", e)
 
1331
  text_blocks = []
1332
- new_request_metadata = "Failed Textract API call"
1333
 
1334
  # Check if "pages" key exists, if not, initialise it as an empty list
1335
- if "pages" not in textract_data: textract_data["pages"] = []
 
 
1336
 
1337
  request_metadata = request_metadata + "\n" + new_request_metadata
1338
 
 
205
  latest_file_completed = int(latest_file_completed)
206
 
207
  if isinstance(file_paths,str): number_of_files = 1
208
+ else: number_of_files = len(file_paths_list)
209
 
210
  # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
211
  if latest_file_completed >= number_of_files:
 
330
 
331
 
332
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
333
+ if pii_identification_method == aws_pii_detector:
334
  if aws_access_key_textbox and aws_secret_key_textbox:
335
  print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
336
  comprehend_client = boto3.client('comprehend',
 
349
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
350
  print(out_message)
351
  raise Exception(out_message)
352
+ else:
353
+ comprehend_client = ""
354
 
355
  # Try to connect to AWS Textract Client if using that text extraction method
356
  if text_extraction_method == textract_option:
 
366
  print("Getting Textract credentials from environment variables.")
367
  textract_client = boto3.client('textract',
368
  aws_access_key_id=AWS_ACCESS_KEY,
369
+ aws_secret_access_key=AWS_SECRET_KEY, region_name=AWS_REGION)
370
+ elif textract_output_found==True:
371
+ print("Existing Textract data found for file, no need to connect to AWS Textract")
372
+ textract_client = boto3.client('textract', region_name=AWS_REGION)
373
  else:
374
  textract_client = ""
375
+ out_message = "Cannot connect to AWS Textract service."
376
+ print(out_message)
377
+ raise Exception(out_message)
378
+ else:
379
+ textract_client = ""
380
 
381
  # Check if output_folder exists, create it if it doesn't
382
  if not os.path.exists(output_folder): os.makedirs(output_folder)
 
769
 
770
  return new_file_path
771
 
772
+ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict, image:Image, page_sizes_df:pd.DataFrame):
773
  '''
774
  Prepare an image annotation box and coordinates based on a CustomImageRecogniserResult, PyMuPDF page, and PIL Image.
775
  '''
776
 
777
  img_annotation_box = {}
778
 
779
+ # For efficient lookup, set 'page' as index if it's not already
780
+ if 'page' in page_sizes_df.columns:
781
+ page_sizes_df = page_sizes_df.set_index('page')
782
+ # PyMuPDF page numbers are 0-based, DataFrame index assumed 1-based
783
+ page_num_one_based = page.number + 1
784
+
785
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = 0, 0, 0, 0 # Initialize defaults
786
+
787
+
788
  if image:
789
  pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
790
+
791
  else:
792
+ # --- Calculate coordinates when no image is present ---
793
+ # Assumes annot coords are normalized relative to MediaBox (top-left origin)
794
+ try:
795
+ # 1. Get MediaBox dimensions from the DataFrame
796
+ page_info = page_sizes_df.loc[page_num_one_based]
797
+ mb_width = page_info['mediabox_width']
798
+ mb_height = page_info['mediabox_height']
799
+ x_offset = page_info['cropbox_x_offset']
800
+ y_offset = page_info['cropbox_y_offset_from_top']
801
+
802
+
803
+ # Check for invalid dimensions
804
+ if mb_width <= 0 or mb_height <= 0:
805
+ print(f"Warning: Invalid MediaBox dimensions ({mb_width}x{mb_height}) for page {page_num_one_based}. Setting coords to 0.")
806
+ else:
807
+ pymupdf_x1 = annot.left - x_offset
808
+ pymupdf_x2 = annot.left + annot.width - x_offset
809
+ pymupdf_y1 = annot.top - y_offset
810
+ pymupdf_y2 = annot.top + annot.height - y_offset
811
+
812
+ except KeyError:
813
+ print(f"Warning: Page number {page_num_one_based} not found in page_sizes_df. Cannot get MediaBox dimensions. Setting coords to 0.")
814
+ except AttributeError as e:
815
+ print(f"Error accessing attributes ('left', 'top', etc.) on 'annot' object for page {page_num_one_based}: {e}")
816
+ except Exception as e:
817
+ print(f"Error during coordinate calculation for page {page_num_one_based}: {e}")
818
+
819
+ rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
820
+
821
+ # Now creating image annotation object
822
+ image_x1 = annot.left
823
+ image_x2 = annot.left + annot.width
824
+ image_y1 = annot.top
825
+ image_y2 = annot.top + annot.height
826
+
827
+ # Create image annotation boxes
828
+ img_annotation_box["xmin"] = image_x1
829
+ img_annotation_box["ymin"] = image_y1
830
+ img_annotation_box["xmax"] = image_x2 # annot.left + annot.width
831
+ img_annotation_box["ymax"] = image_y2 # annot.top + annot.height
832
  img_annotation_box["color"] = (0,0,0)
833
  try:
834
  img_annotation_box["label"] = str(annot.entity_type)
 
838
  if hasattr(annot, 'text') and annot.text:
839
  img_annotation_box["text"] = str(annot.text)
840
  else:
841
+ img_annotation_box["text"] = ""
 
 
842
 
843
  return img_annotation_box, rect
844
 
845
+
846
  def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict, image:Image=None, convert_pikepdf_to_pymupdf_coords:bool=True, page_sizes_df:pd.DataFrame=pd.DataFrame(), image_dimensions:dict={}):
847
  '''
848
  Convert redaction objects with pikepdf coordinates to annotation boxes for PyMuPDF that can then be redacted from the document. First 1. converts pikepdf to pymupdf coordinates, then 2. converts pymupdf coordinates to image coordinates if page is an image.
 
993
  rect = Rect(pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2) # Create the PyMuPDF Rect
994
 
995
  # Else should be CustomImageRecognizerResult
996
+ elif isinstance(annot, CustomImageRecognizerResult):
997
+ #print("annot is a CustomImageRecognizerResult")
998
+ img_annotation_box, rect = prepare_custom_image_recogniser_result_annotation_box(page, annot, image, page_sizes_df)
999
 
1000
  # Else it should be a pikepdf annotation object
1001
  else:
 
1213
 
1214
  tic = time.perf_counter()
1215
 
1216
+ file_name = get_file_name_without_type(file_path)
 
1217
  comprehend_query_number_new = 0
1218
 
1219
  # Update custom word list analyser object with any new words that have been added to the custom deny list
 
1253
  # If running Textract, check if file already exists. If it does, load in existing data
1254
  if text_extraction_method == textract_option:
1255
  textract_json_file_path = output_folder + file_name + "_textract.json"
1256
+ textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1257
  original_textract_data = textract_data.copy()
1258
 
1259
  ###
 
1327
 
1328
  # Check if page exists in existing textract data. If not, send to service to analyse
1329
  if text_extraction_method == textract_option:
1330
+ text_blocks = []
1331
+
1332
  if not textract_data:
1333
  try:
1334
  # Convert the image_path to bytes using an in-memory buffer
 
1371
  textract_data["pages"].append(text_blocks)
1372
 
1373
  except Exception as e:
1374
+ out_message = "Textract extraction for page " + reported_page_number + " failed due to:" + str(e)
1375
+ print(out_message)
1376
  text_blocks = []
1377
+ new_request_metadata = "Failed Textract API call"
1378
 
1379
  # Check if "pages" key exists, if not, initialise it as an empty list
1380
+ if "pages" not in textract_data: textract_data["pages"] = []
1381
+
1382
+ raise Exception(out_message)
1383
 
1384
  request_metadata = request_metadata + "\n" + new_request_metadata
1385
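
The no-image branch of prepare_custom_image_recogniser_result_annotation_box above reduces to shifting the recogniser box by the CropBox offsets before building the PyMuPDF Rect. A standalone sketch with made-up values (pymupdf.Rect is fitz.Rect on older releases):

import pymupdf

# A recogniser box given relative to the MediaBox top-left, on a page whose
# CropBox is inset 20pt from the left and 30pt from the top (illustrative values)
left, top, width, height = 120.0, 200.0, 80.0, 14.0
x_offset, y_offset = 20.0, 30.0

rect = pymupdf.Rect(left - x_offset,
                    top - y_offset,
                    left + width - x_offset,
                    top + height - y_offset)
print(rect)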
 
tools/helper_functions.py CHANGED
@@ -9,7 +9,7 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
@@ -31,7 +31,7 @@ def reset_state_vars():
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
- ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], ""
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
@@ -44,23 +44,54 @@ def load_in_default_allow_list(allow_list_file_path):
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
- def load_in_default_cost_codes(cost_codes_path:str):
 
 
 
48
  cost_codes_df = pd.read_csv(cost_codes_path)
49
-
50
- dropdown_choices = cost_codes_df.iloc[:,0].to_list()
51
- dropdown_choices.insert(0, "")
52
-
53
-
54
- out_dropdown = gr.Dropdown(value="", label="Choose cost code for analysis", choices=dropdown_choices, allow_custom_value=True)
55
 
56
  return cost_codes_df, cost_codes_df, out_dropdown
57
 
58
- def enforce_cost_codes(enforce_cost_code_textbox, cost_code_choice):
 
 
 
 
59
  if enforce_cost_code_textbox == "True":
60
  if not cost_code_choice:
61
  raise Exception("Please choose a cost code before continuing")
 
 
 
 
 
 
 
 
 
62
  return
63
64
  def update_dataframe(df:pd.DataFrame):
65
  df_copy = df.copy()
66
  return df_copy
@@ -201,10 +232,10 @@ def check_for_existing_textract_file(doc_file_name_no_extension_textbox:str, out
201
  else:
202
  return False
203
 
204
- # Following function is only relevant for locally-created executable files based on this app (when using pyinstaller it creates a _internal folder that contains tesseract and poppler. These need to be added to the system path to enable the app to run)
205
  def add_folder_to_path(folder_path: str):
206
  '''
207
- Check if a folder exists on your system. If so, get the absolute path and then add it to the system Path variable if it doesn't already exist.
208
  '''
209
 
210
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
@@ -271,7 +302,14 @@ def merge_csv_files(file_list:List[str], output_folder:str=OUTPUT_FOLDER):
271
 
272
  return output_files
273
 
274
- async def get_connection_params(request: gr.Request, output_folder_textbox:str=OUTPUT_FOLDER, input_folder_textbox:str=INPUT_FOLDER, session_output_folder:str=SESSION_OUTPUT_FOLDER):
275
 
276
  #print("Session hash:", request.session_hash)
277
 
@@ -323,6 +361,13 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
323
  if session_output_folder == 'True':
324
  output_folder = output_folder_textbox + out_session_hash + "/"
325
  input_folder = input_folder_textbox + out_session_hash + "/"
326
  else:
327
  output_folder = output_folder_textbox
328
  input_folder = input_folder_textbox
@@ -330,8 +375,7 @@ async def get_connection_params(request: gr.Request, output_folder_textbox:str=O
330
  if not os.path.exists(output_folder): os.mkdir(output_folder)
331
  if not os.path.exists(input_folder): os.mkdir(input_folder)
332
 
333
-
334
- return out_session_hash, output_folder, out_session_hash, input_folder
335
 
336
  def clean_unicode_text(text:str):
337
  # Step 1: Normalise unicode characters to decompose any special forms
@@ -374,6 +418,8 @@ def calculate_aws_costs(number_of_pages:str,
374
  pii_identification_method:str,
375
  textract_output_found_checkbox:bool,
376
  only_extract_text_radio:bool,
 
 
377
  textract_page_cost:float=1.5/1000,
378
  textract_signature_cost:float=2.0/1000,
379
  comprehend_unit_cost:float=0.0001,
@@ -391,6 +437,8 @@ def calculate_aws_costs(number_of_pages:str,
391
  - pii_identification_method_drop: The method of personally-identifiable information removal.
392
  - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
393
  - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
 
 
394
  - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
395
  - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
396
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
@@ -419,6 +467,9 @@ def calculate_aws_costs(number_of_pages:str,
419
 
420
  calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
421
 
 
 
 
422
  return calculated_aws_cost
423
 
424
  def calculate_time_taken(number_of_pages:str,
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
 
31
  show_share_button=False,
32
  show_remove_button=False,
33
  interactive=False
34
+ ), [], [], pd.DataFrame(), pd.DataFrame(), [], [], "", False
35
 
36
  def reset_ocr_results_state():
37
  return pd.DataFrame(), pd.DataFrame(), []
 
44
  allow_list_file_path = [allow_list_file_path]
45
  return allow_list_file_path
46
 
47
+ def load_in_default_cost_codes(cost_codes_path:str, default_cost_code:str=""):
48
+ '''
49
+ Load in the cost codes list from file.
50
+ '''
51
  cost_codes_df = pd.read_csv(cost_codes_path)
52
+ dropdown_choices = cost_codes_df.iloc[:, 0].astype(str).tolist()
53
+
54
+ # Avoid inserting duplicate or empty cost code values
55
+ if default_cost_code and default_cost_code not in dropdown_choices:
56
+ dropdown_choices.insert(0, default_cost_code)
57
+
58
+ # Always have a blank option at the top
59
+ if "" not in dropdown_choices:
60
+ dropdown_choices.insert(0, "")
61
+
62
+ out_dropdown = gr.Dropdown(
63
+ value=default_cost_code if default_cost_code in dropdown_choices else "",
64
+ label="Choose cost code for analysis",
65
+ choices=dropdown_choices,
66
+ allow_custom_value=False
67
+ )
68
 
69
  return cost_codes_df, cost_codes_df, out_dropdown
70
 
71
+ def enforce_cost_codes(enforce_cost_code_textbox:str, cost_code_choice:str, cost_code_df:pd.DataFrame, verify_cost_codes:bool=True):
72
+ '''
73
+ Check if the enforce cost codes variable is set to True, and then check that a cost code has been chosen. If not, raise an error. Then check against the values in the cost code dataframe to ensure that the cost code exists.
74
+ '''
75
+
76
  if enforce_cost_code_textbox == "True":
77
  if not cost_code_choice:
78
  raise Exception("Please choose a cost code before continuing")
79
+
80
+ if verify_cost_codes == True:
81
+ if cost_code_df.empty:
82
+ raise Exception("No cost codes present in dataframe for verification")
83
+ else:
84
+ valid_cost_codes_list = list(cost_code_df.iloc[:,0].unique())
85
+
86
+ if cost_code_choice not in valid_cost_codes_list:
87
+ raise Exception("Selected cost code not found in list. Please contact Finance if you cannot find the correct cost code from the given list of suggestions.")
88
  return
89
 
90
+ def update_cost_code_dataframe_from_dropdown_select(cost_dropdown_selection:str, cost_code_df:pd.DataFrame):
91
+ cost_code_df = cost_code_df.loc[cost_code_df.iloc[:,0] == cost_dropdown_selection, :]
92
+
93
+ return cost_code_df
94
+
95
  def update_dataframe(df:pd.DataFrame):
96
  df_copy = df.copy()
97
  return df_copy
 
232
  else:
233
  return False
234
 
235
+ #
236
  def add_folder_to_path(folder_path: str):
237
  '''
238
+ Check if a folder exists on your system. If so, get the absolute path and add it to the system PATH variable if it is not already there. This function is only relevant for locally-created executables built from this app (PyInstaller creates an _internal folder containing Tesseract and Poppler, which needs to be added to the system PATH for the app to run).
239
  '''
240
 
241
  if os.path.exists(folder_path) and os.path.isdir(folder_path):
 
302
 
303
  return output_files
304
 
305
+ async def get_connection_params(request: gr.Request,
306
+ output_folder_textbox:str=OUTPUT_FOLDER,
307
+ input_folder_textbox:str=INPUT_FOLDER,
308
+ session_output_folder:str=SESSION_OUTPUT_FOLDER,
309
+ textract_document_upload_input_folder:str=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER,
310
+ textract_document_upload_output_folder:str=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER,
311
+ s3_textract_document_logs_subfolder:str=TEXTRACT_JOBS_S3_LOC,
312
+ local_textract_document_logs_subfolder:str=TEXTRACT_JOBS_LOCAL_LOC):
313
 
314
  #print("Session hash:", request.session_hash)
315
 
 
361
  if session_output_folder == 'True':
362
  output_folder = output_folder_textbox + out_session_hash + "/"
363
  input_folder = input_folder_textbox + out_session_hash + "/"
364
+
365
+ textract_document_upload_input_folder = textract_document_upload_input_folder + "/" + out_session_hash
366
+ textract_document_upload_output_folder = textract_document_upload_output_folder + "/" + out_session_hash
367
+
368
+ s3_textract_document_logs_subfolder = s3_textract_document_logs_subfolder + "/" + out_session_hash
369
+ local_textract_document_logs_subfolder = local_textract_document_logs_subfolder + "/" + out_session_hash + "/"
370
+
371
  else:
372
  output_folder = output_folder_textbox
373
  input_folder = input_folder_textbox
 
375
  if not os.path.exists(output_folder): os.mkdir(output_folder)
376
  if not os.path.exists(input_folder): os.mkdir(input_folder)
377
 
378
+ return out_session_hash, output_folder, out_session_hash, input_folder, textract_document_upload_input_folder, textract_document_upload_output_folder, s3_textract_document_logs_subfolder, local_textract_document_logs_subfolder
 
379
 
380
  def clean_unicode_text(text:str):
381
  # Step 1: Normalise unicode characters to decompose any special forms
 
418
  pii_identification_method:str,
419
  textract_output_found_checkbox:bool,
420
  only_extract_text_radio:bool,
421
+ convert_to_gbp:bool=True,
422
+ usd_gbp_conversion_rate:float=0.76,
423
  textract_page_cost:float=1.5/1000,
424
  textract_signature_cost:float=2.0/1000,
425
  comprehend_unit_cost:float=0.0001,
 
437
  - pii_identification_method_drop: The method of personally-identifiable information removal.
438
  - textract_output_found_checkbox: Whether existing Textract results have been found in the output folder. Assumes that results exist for all pages and files in the output folder.
439
  - only_extract_text_radio (bool, optional): Option to only extract text from the document rather than redact.
440
+ - convert_to_gbp (bool, optional): Should suggested costs be converted from USD to GBP.
441
+ - usd_gbp_conversion_rate (float, optional): Conversion rate used for USD to GBP. Last changed 14th April 2025.
442
  - textract_page_cost (float, optional): AWS pricing for Textract text extraction per page ($).
443
  - textract_signature_cost (float, optional): Additional AWS cost above standard AWS Textract extraction for extracting signatures.
444
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
 
467
 
468
  calculated_aws_cost = calculated_aws_cost + text_extraction_cost + pii_identification_cost
469
 
470
+ if convert_to_gbp == True:
471
+ calculated_aws_cost *= usd_gbp_conversion_rate
472
+
473
  return calculated_aws_cost
474
 
475
  def calculate_time_taken(number_of_pages:str,
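
As a sanity check on the pricing defaults in calculate_aws_costs, the arithmetic can be reproduced by hand. Illustrative only: the per-page Comprehend unit count below is an assumption, and AWS prices and the USD/GBP rate change over time:

# 100 pages through Textract plus Comprehend PII detection
number_of_pages = 100
text_extraction_cost = number_of_pages * (1.5 / 1000)    # textract_page_cost -> $0.15
pii_identification_cost = number_of_pages * 3 * 0.0001   # assuming ~3 Comprehend units per page -> $0.03
total_usd = text_extraction_cost + pii_identification_cost
total_gbp = total_usd * 0.76                             # usd_gbp_conversion_rate default
print(round(total_usd, 4), round(total_gbp, 4))          # 0.18 0.1368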
tools/redaction_review.py CHANGED
@@ -577,7 +577,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
577
  output_files.append(orig_pdf_file_path)
578
 
579
  try:
580
- print("Saving review file.")
581
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
 
@@ -756,6 +756,18 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
756
 
757
  return row_value_page, row_value_df
758
759
  def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
760
 
761
  row_value_code = evt.row_value[0] # This is the value for cost code
 
577
  output_files.append(orig_pdf_file_path)
578
 
579
  try:
580
+ #print("Saving review file.")
581
  review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
582
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
583
 
 
756
 
757
  return row_value_page, row_value_df
758
 
759
+ def df_select_callback_textract_api(df: pd.DataFrame, evt: gr.SelectData):
760
+
761
+ #print("evt.data:", evt._data)
762
+
763
+ row_value_job_id = evt.row_value[0] # This is the job ID value
764
+ # row_value_label = evt.row_value[1] # This is the label value
765
+ row_value_job_type = evt.row_value[2] # This is the job type value
766
+
767
+ row_value_df = pd.DataFrame(data={"job_id":[row_value_job_id], "label":[row_value_job_type]})
768
+
769
+ return row_value_job_id, row_value_job_type, row_value_df
770
+
771
  def df_select_callback_cost(df: pd.DataFrame, evt: gr.SelectData):
772
 
773
  row_value_code = evt.row_value[0] # This is the value for cost code
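
The new df_select_callback_textract_api above is designed to be wired to a Gradio Dataframe select event; the event object is injected via the gr.SelectData annotation. A hypothetical wiring sketch (component names are illustrative, not from app.py):

import gradio as gr
import pandas as pd
from tools.redaction_review import df_select_callback_textract_api

with gr.Blocks() as demo:
    jobs_df = gr.Dataframe(pd.DataFrame(columns=["job_id", "file", "job_type"]))
    job_id_box = gr.Textbox(label="Job ID")
    job_type_box = gr.Textbox(label="Job type")
    selection_df = gr.Dataframe(visible=False)

    # Clicking a row fills the textboxes and the hidden selection dataframe
    jobs_df.select(df_select_callback_textract_api,
                   inputs=[jobs_df],
                   outputs=[job_id_box, job_type_box, selection_df])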
tools/textract_batch_call.py CHANGED
@@ -1,22 +1,36 @@
1
  import boto3
2
  import time
3
  import os
 
4
  import json
5
  import logging
 
 
 
6
  from urllib.parse import urlparse
 
7
 
8
- # Configure logging
9
- logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
10
 
11
- def analyze_pdf_with_textract(
 
12
  local_pdf_path: str,
13
- s3_bucket_name: str,
14
  s3_input_prefix: str,
15
  s3_output_prefix: str,
16
- local_output_dir: str,
17
- aws_region: str = None, # Optional: specify region if not default
18
- poll_interval_seconds: int = 5,
19
- max_polling_attempts: int = 120 # ~10 minutes total wait time
 
 
 
20
  ):
21
  """
22
  Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),
@@ -27,10 +41,12 @@ def analyze_pdf_with_textract(
27
  s3_bucket_name (str): Name of the S3 bucket to use.
28
  s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
29
  s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
30
- local_output_dir (str): Local directory to save the downloaded JSON results.
 
 
 
 
31
  aws_region (str, optional): AWS region name. Defaults to boto3 default region.
32
- poll_interval_seconds (int): Seconds to wait between polling Textract status.
33
- max_polling_attempts (int): Maximum number of times to poll Textract status.
34
 
35
  Returns:
36
  str: Path to the downloaded local JSON output file, or None if failed.
@@ -41,12 +57,21 @@ def analyze_pdf_with_textract(
41
  Exception: For other AWS errors or job failures.
42
  """
43
44
  if not os.path.exists(local_pdf_path):
45
- raise FileNotFoundError(f"Input PDF not found: {local_pdf_path}")
46
 
47
  if not os.path.exists(local_output_dir):
48
  os.makedirs(local_output_dir)
49
- logging.info(f"Created local output directory: {local_output_dir}")
 
 
50
 
51
  # Initialize boto3 clients
52
  session = boto3.Session(region_name=aws_region)
@@ -57,216 +82,407 @@ def analyze_pdf_with_textract(
57
  pdf_filename = os.path.basename(local_pdf_path)
58
  s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3
59
 
60
- logging.info(f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'...")
 
 
61
  try:
62
  s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
63
- logging.info("Upload successful.")
 
 
64
  except Exception as e:
65
- logging.error(f"Failed to upload PDF to S3: {e}")
 
 
66
  raise
67
68
  # --- 2. Start Textract Document Analysis ---
69
- logging.info("Starting Textract document analysis job...")
 
 
 
70
  try:
71
- response = textract_client.start_document_analysis(
72
- DocumentLocation={
73
- 'S3Object': {
74
- 'Bucket': s3_bucket_name,
75
- 'Name': s3_input_key
76
  }
77
- },
78
- FeatureTypes=['SIGNATURES', 'FORMS', 'TABLES'], # Analyze for signatures, forms, and tables
79
- OutputConfig={
80
- 'S3Bucket': s3_bucket_name,
81
- 'S3Prefix': s3_output_prefix
82
- }
83
- # Optional: Add NotificationChannel for SNS topic notifications
84
- # NotificationChannel={
85
- # 'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
86
- # 'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
87
- # }
88
- )
89
- job_id = response['JobId']
90
- logging.info(f"Textract job started with JobId: {job_id}")
91
-
92
- except Exception as e:
93
- logging.error(f"Failed to start Textract job: {e}")
94
- raise
95
-
96
- # --- 3. Poll for Job Completion ---
97
- job_status = 'IN_PROGRESS'
98
- attempts = 0
99
- logging.info("Polling Textract for job completion status...")
100
 
101
- while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
102
- attempts += 1
103
- try:
104
- response = textract_client.get_document_analysis(JobId=job_id)
105
- job_status = response['JobStatus']
106
- logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
107
-
108
- if job_status == 'IN_PROGRESS':
109
- time.sleep(poll_interval_seconds)
110
- elif job_status == 'SUCCEEDED':
111
- logging.info("Textract job succeeded.")
112
- break
113
- elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
114
- status_message = response.get('StatusMessage', 'No status message provided.')
115
- warnings = response.get('Warnings', [])
116
- logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
117
- if warnings:
118
- logging.warning(f"Warnings: {warnings}")
119
- # Decide if PARTIAL_SUCCESS should proceed or raise error
120
- # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
121
- raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
122
- else:
123
- # Should not happen based on documentation, but handle defensively
124
- raise Exception(f"Unexpected Textract job status: {job_status}")
125
 
126
- except textract_client.exceptions.InvalidJobIdException:
127
- logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed.")
128
- raise
129
- except Exception as e:
130
- logging.error(f"Error while polling Textract status for job {job_id}: {e}")
131
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
- if job_status != 'SUCCEEDED':
134
- raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
135
 
136
- # --- 4. Download Output JSON from S3 ---
137
- # Textract typically creates output under s3_output_prefix/job_id/
138
- # There might be multiple JSON files if pagination occurred during writing.
139
- # Usually, for smaller docs, there's one file, often named '1'.
140
- # For robust handling, list objects and find the JSON(s).
141
 
142
- s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
143
- logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
 
 
 
144
 
145
- downloaded_file_path = None
146
- try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  list_response = s3_client.list_objects_v2(
148
  Bucket=s3_bucket_name,
149
  Prefix=s3_output_key_prefix
150
  )
151
-
152
  output_files = list_response.get('Contents', [])
153
- if not output_files:
154
- # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
155
- logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
156
- time.sleep(5)
157
- list_response = s3_client.list_objects_v2(
158
- Bucket=s3_bucket_name,
159
- Prefix=s3_output_key_prefix
160
- )
161
- output_files = list_response.get('Contents', [])
162
-
163
- if not output_files:
164
- logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
165
- # You could alternatively try getting results via get_document_analysis pagination here
166
- # but sticking to the request to download from S3 output path.
167
- raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
168
-
169
- # Usually, we only need the first/main JSON output file(s)
170
- # For simplicity, download the first one found. A more complex scenario might merge multiple files.
171
- # Filter out potential directory markers if any key ends with '/'
172
- json_files_to_download = [f for f in output_files if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/')]
173
-
174
- if not json_files_to_download:
175
- logging.error(f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}")
176
- raise FileNotFoundError(f"Textract output JSON files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
177
-
178
- # Let's download the first JSON found. Often it's the only one or the main one.
179
- s3_output_key = json_files_to_download[0]['Key']
180
- output_filename_base = os.path.basename(pdf_filename).replace('.pdf', '')
181
- local_output_filename = f"{output_filename_base}_textract_output_{job_id}.json"
182
- local_output_path = os.path.join(local_output_dir, local_output_filename)
183
-
184
- logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
185
- s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
186
- logging.info("Download successful.")
187
- downloaded_file_path = local_output_path
188
-
189
- # Log if multiple files were found, as user might need to handle them
190
- if len(json_files_to_download) > 1:
191
- logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")
192
 
193
- except Exception as e:
194
- logging.error(f"Failed to download or process Textract output from S3: {e}")
195
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
  return downloaded_file_path
198
 
199
- # --- Example Usage ---
200
- if __name__ == '__main__':
201
- # --- Configuration --- (Replace with your actual values)
202
- MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
203
- MY_S3_BUCKET = "your-textract-demo-bucket-name" # MUST BE UNIQUE GLOBALLY
204
- MY_S3_INPUT_PREFIX = "textract-inputs" # Folder in the bucket for uploads
205
- MY_S3_OUTPUT_PREFIX = "textract-outputs" # Folder in the bucket for results
206
- MY_LOCAL_OUTPUT_DIR = "./textract_results" # Local folder to save JSON
207
- MY_AWS_REGION = "us-east-1" # e.g., 'us-east-1', 'eu-west-1'
208
-
209
- # --- Create a dummy PDF for testing if you don't have one ---
210
- # Requires 'reportlab' library: pip install reportlab
211
- try:
212
- from reportlab.pdfgen import canvas
213
- from reportlab.lib.pagesizes import letter
214
- if not os.path.exists(MY_LOCAL_PDF):
215
- print(f"Creating dummy PDF: {MY_LOCAL_PDF}")
216
- c = canvas.Canvas(MY_LOCAL_PDF, pagesize=letter)
217
- c.drawString(100, 750, "This is a test document for AWS Textract.")
218
- c.drawString(100, 700, "It includes some text and a placeholder for a signature.")
219
- c.drawString(100, 650, "Signed:")
220
- # Draw a simple line/scribble for signature placeholder
221
- c.line(150, 630, 250, 645)
222
- c.line(250, 645, 300, 620)
223
- c.save()
224
- print("Dummy PDF created.")
225
- except ImportError:
226
- if not os.path.exists(MY_LOCAL_PDF):
227
- print(f"Warning: reportlab not installed and '{MY_LOCAL_PDF}' not found. Cannot run example without an input PDF.")
228
- exit() # Exit if no PDF available for the example
229
- except Exception as e:
230
- print(f"Error creating dummy PDF: {e}")
231
- exit()
232
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- # --- Run the analysis ---
235
- try:
236
- output_json_path = analyze_pdf_with_textract(
237
- local_pdf_path=MY_LOCAL_PDF,
238
- s3_bucket_name=MY_S3_BUCKET,
239
- s3_input_prefix=MY_S3_INPUT_PREFIX,
240
- s3_output_prefix=MY_S3_OUTPUT_PREFIX,
241
- local_output_dir=MY_LOCAL_OUTPUT_DIR,
242
- aws_region=MY_AWS_REGION
243
- )
244
 
245
- if output_json_path:
246
- print(f"\n--- Analysis Complete ---")
247
- print(f"Textract output JSON saved to: {output_json_path}")
248
 
249
- # Optional: Load and print some info from the JSON
250
- with open(output_json_path, 'r') as f:
251
- results = json.load(f)
252
- print(f"Detected {results.get('DocumentMetadata', {}).get('Pages', 'N/A')} page(s).")
253
- # Find signature blocks (Note: This is basic, real parsing might be more complex)
254
- signature_blocks = [block for block in results.get('Blocks', []) if block.get('BlockType') == 'SIGNATURE']
255
- print(f"Found {len(signature_blocks)} potential signature block(s).")
256
- if signature_blocks:
257
- print(f"First signature confidence: {signature_blocks[0].get('Confidence', 'N/A')}")
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
 
260
- except FileNotFoundError as e:
261
- print(f"\nError: Input file not found. {e}")
262
- except Exception as e:
263
- print(f"\nAn error occurred during the process: {e}")
264
 
265
- import boto3
266
- import time
267
- import os
268
 
269
- def download_textract_output(job_id, output_bucket, output_prefix, local_folder):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  """
271
  Checks the status of a Textract job and downloads the output ZIP file if the job is complete.
272
 
@@ -290,8 +506,8 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
          print("Job failed:", response.get("StatusMessage", "No error message provided."))
          return
      else:
-         print(f"Job is still {status}, waiting...")
-         time.sleep(10) # Wait before checking again

      # Find output ZIP file in S3
      output_file_key = f"{output_prefix}/{job_id}.zip"
@@ -303,6 +519,3 @@ def download_textract_output(job_id, output_bucket, output_prefix, local_folder)
          print(f"Output file downloaded to: {local_file_path}")
      except Exception as e:
          print(f"Error downloading file: {e}")
-
- # Example usage:
- # download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")

  import boto3
  import time
  import os
+ import pandas as pd
  import json
  import logging
+ import datetime
+ from typing import List
+ from io import StringIO
  from urllib.parse import urlparse
+ from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError

+ # MY_LOCAL_PDF = r"C:\path\to\your\document.pdf" # Use raw string for Windows paths
+ # MY_S3_BUCKET = TEXTRACT_BULK_ANALYSIS_BUCKET # MUST BE UNIQUE GLOBALLY
+ # MY_S3_INPUT_PREFIX = session_hash_textbox # Folder in the bucket for uploads
+ # MY_S3_OUTPUT_PREFIX = session_hash_textbox # Folder in the bucket for results
+ # MY_LOCAL_OUTPUT_DIR = OUTPUT_FOLDER # Local folder to save JSON
+ # MY_AWS_REGION = AWS_REGION # e.g., 'us-east-1', 'eu-west-1'
+ from tools.config import TEXTRACT_BULK_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
+ from tools.aws_textract import json_to_ocrresult

+
+ def analyse_document_with_textract_api(
      local_pdf_path: str,
      s3_input_prefix: str,
      s3_output_prefix: str,
+     job_df: pd.DataFrame,
+     s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
+     local_output_dir: str = OUTPUT_FOLDER,
+     analyse_signatures: List[str] = [],
+     successful_job_number: int = 0,
+     general_s3_bucket_name: str = DOCUMENT_REDACTION_BUCKET,
+     aws_region: str = AWS_REGION # Optional: specify region if not default
  ):
      """
      Uploads a local PDF to S3, starts a Textract analysis job (detecting text & signatures),

          s3_bucket_name (str): Name of the S3 bucket to use.
          s3_input_prefix (str): S3 prefix (folder) to upload the input PDF.
          s3_output_prefix (str): S3 prefix (folder) where Textract should write output.
+         job_df (pd.DataFrame): Dataframe containing information from previous Textract API calls.
+         s3_bucket_name (str, optional): S3 bucket in which to save API call outputs.
+         local_output_dir (str, optional): Local directory to save the downloaded JSON results.
+         analyse_signatures (List[str], optional): Signature-analysis options; include "Extract signatures" to run signature detection. Defaults to no signature extraction.
+         successful_job_number (int): The number of successful jobs that have been submitted in this session.
          aws_region (str, optional): AWS region name. Defaults to boto3 default region.

      Returns:
          str: Path to the downloaded local JSON output file, or None if failed.

          Exception: For other AWS errors or job failures.
      """

+     # This is a variable that is written to logs to indicate that a Textract API call was made
+     is_a_textract_api_call = True
+
+     # Keep only the latest pdf path if a list is passed in
+     if isinstance(local_pdf_path, list):
+         local_pdf_path = local_pdf_path[-1]
+
      if not os.path.exists(local_pdf_path):
+         raise FileNotFoundError(f"Input document not found: {local_pdf_path}")

      if not os.path.exists(local_output_dir):
          os.makedirs(local_output_dir)
+         log_message = f"Created local output directory: {local_output_dir}"
+         print(log_message)
+         #logging.info(log_message)

      # Initialize boto3 clients
      session = boto3.Session(region_name=aws_region)

      pdf_filename = os.path.basename(local_pdf_path)
      s3_input_key = os.path.join(s3_input_prefix, pdf_filename).replace("\\", "/") # Ensure forward slashes for S3

+     log_message = f"Uploading '{local_pdf_path}' to 's3://{s3_bucket_name}/{s3_input_key}'..."
+     print(log_message)
+     #logging.info(log_message)
      try:
          s3_client.upload_file(local_pdf_path, s3_bucket_name, s3_input_key)
+         log_message = "Upload successful."
+         print(log_message)
+         #logging.info(log_message)
      except Exception as e:
+         log_message = f"Failed to upload PDF to S3: {e}"
+         print(log_message)
+         #logging.error(log_message)
          raise

+     # If job_df is not empty, check whether this file has already been analysed with the same signature option
+     if not job_df.empty:
+         if "file_name" in job_df.columns:
+             matching_job_id_file_names = job_df.loc[(job_df["file_name"] == pdf_filename) & (job_df["signature_extraction"].astype(str) == str(analyse_signatures)), "file_name"]
+
+             if len(matching_job_id_file_names) > 0:
+                 raise Exception("Existing Textract outputs found. No need to re-analyse. Please download existing results from the list.")
+
      # --- 2. Start Textract Document Analysis ---
+     message = "Starting Textract document analysis job..."
+     print(message)
+     #logging.info("Starting Textract document analysis job...")
+
      try:
+         if "Extract signatures" in analyse_signatures:
+             response = textract_client.start_document_analysis(
+                 DocumentLocation={
+                     'S3Object': {
+                         'Bucket': s3_bucket_name,
+                         'Name': s3_input_key
+                     }
+                 },
+                 FeatureTypes=['SIGNATURES'], # Analyze for signatures only
+                 OutputConfig={
+                     'S3Bucket': s3_bucket_name,
+                     'S3Prefix': s3_output_prefix
                  }
+                 # Optional: Add NotificationChannel for SNS topic notifications
+                 # NotificationChannel={
+                 #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
+                 #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
+                 # }
+             )
+             job_type = "document_analysis"

+         else:
+             response = textract_client.start_document_text_detection(
+                 DocumentLocation={
+                     'S3Object': {
+                         'Bucket': s3_bucket_name,
+                         'Name': s3_input_key
+                     }
+                 },
+                 OutputConfig={
+                     'S3Bucket': s3_bucket_name,
+                     'S3Prefix': s3_output_prefix
+                 }
+                 # Optional: Add NotificationChannel for SNS topic notifications
+                 # NotificationChannel={
+                 #     'SNSTopicArn': 'YOUR_SNS_TOPIC_ARN',
+                 #     'RoleArn': 'YOUR_IAM_ROLE_ARN_FOR_TEXTRACT_TO_ACCESS_SNS'
+                 # }
+             )
+             job_type = "document_text_detection"

+         job_id = response['JobId']
+         print(f"Textract job started with JobId: {job_id}")
+         #logging.info(f"Textract job started with JobId: {job_id}")
+
+         # Append the job details to a local log CSV, then push the log to S3
+         log_csv_key_location = f"{s3_output_prefix}/textract_document_jobs.csv"
+         job_location_full = f"s3://{s3_bucket_name}/{s3_output_prefix}/{job_id}/"
+
+         csv_buffer = StringIO()
+         log_df = pd.DataFrame([{
+             'job_id': job_id,
+             'file_name': pdf_filename,
+             'job_type': job_type,
+             'signature_extraction': analyse_signatures,
+             's3_location': job_location_full,
+             'job_date_time': datetime.datetime.now()
+         }])
+
+         # File path
+         log_file_path = os.path.join(local_output_dir, "textract_job_log_files.csv")
+
+         # Check if file exists
+         file_exists = os.path.exists(log_file_path)
+
+         # Append to CSV if it exists, otherwise write with header
+         log_df.to_csv(log_file_path, mode='a', index=False, header=not file_exists)
+
+         #log_df.to_csv(csv_buffer)

+         # Upload the log file
+         s3_client.upload_file(log_file_path, general_s3_bucket_name, log_csv_key_location)

+         # Upload to S3 (overwrite existing file)
+         #s3_client.put_object(Bucket=general_s3_bucket_name, Key=log_csv_key_location, Body=csv_buffer.getvalue())
+         print(f"Job ID written to {log_csv_key_location}")
+         #logging.info(f"Job ID written to s3://{s3_bucket_name}/{s3_output_prefix}/textract_document_jobs.csv")

+     except Exception as e:
+         error = f"Failed to start Textract job: {e}"
+         print(error)
+         #logging.error(error)
+         raise

+     successful_job_number += 1
+
+     return f"Textract analysis job submitted, job ID: {job_id}", job_id, job_type, successful_job_number, is_a_textract_api_call
+
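[Note: a minimal usage sketch for the new submission function, with hypothetical file, bucket and prefix values; it assumes the tools.config constants resolve and AWS credentials are available. An empty job_df means no previous jobs are checked.]

    import pandas as pd
    from tools.textract_batch_call import analyse_document_with_textract_api

    message, job_id, job_type, n_jobs, is_api_call = analyse_document_with_textract_api(
        local_pdf_path="example.pdf",               # hypothetical input document
        s3_input_prefix="my-session/input",         # hypothetical prefix
        s3_output_prefix="my-session/output",       # hypothetical prefix
        job_df=pd.DataFrame(),                      # no previous jobs recorded
        analyse_signatures=["Extract signatures"],  # omit for plain text detection
    )
    print(message)  # "Textract analysis job submitted, job ID: ..."

The function deliberately raises when job_df already records the same file name with the same signature option, so accidental duplicate submissions are guarded against.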
+ def return_job_status(job_id: str,
+                       response: dict,
+                       attempts: int,
+                       poll_interval_seconds: int = 5,
+                       max_polling_attempts: int = 1 # Number of polling attempts before the caller gives up
+                       ):
+     job_status = response['JobStatus']
+     logging.info(f"Polling attempt {attempts}/{max_polling_attempts}. Job status: {job_status}")
+
+     if job_status == 'IN_PROGRESS':
+         time.sleep(poll_interval_seconds)
+     elif job_status == 'SUCCEEDED':
+         logging.info("Textract job succeeded.")
+     elif job_status in ['FAILED', 'PARTIAL_SUCCESS']:
+         status_message = response.get('StatusMessage', 'No status message provided.')
+         warnings = response.get('Warnings', [])
+         logging.error(f"Textract job ended with status: {job_status}. Message: {status_message}")
+         if warnings:
+             logging.warning(f"Warnings: {warnings}")
+         # Decide if PARTIAL_SUCCESS should proceed or raise error
+         # For simplicity here, we raise for both FAILED and PARTIAL_SUCCESS
+         raise Exception(f"Textract job {job_id} failed or partially failed. Status: {job_status}. Message: {status_message}")
+     else:
+         # Should not happen based on documentation, but handle defensively
+         raise Exception(f"Unexpected Textract job status: {job_status}")
+
+     return job_status
+
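[Note: return_job_status only reads the JobStatus, StatusMessage and Warnings keys of the Get* response, so its branches can be exercised with a stub dict; this is a sketch, not the full Textract response shape.]

    ok = return_job_status("job-123", {"JobStatus": "SUCCEEDED"}, attempts=1)
    # ok == "SUCCEEDED"

    # A FAILED or PARTIAL_SUCCESS response raises instead of returning:
    # return_job_status("job-123", {"JobStatus": "FAILED", "StatusMessage": "Bad input"}, attempts=1)

An IN_PROGRESS status simply sleeps for poll_interval_seconds and returns 'IN_PROGRESS', leaving the retry decision to the caller.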
+ def download_textract_job_files(s3_client,  # boto3 S3 client object, not a string
+                                 s3_bucket_name: str,
+                                 s3_output_key_prefix: str,
+                                 pdf_filename: str,
+                                 job_id: str,
+                                 local_output_dir: str):
+     list_response = s3_client.list_objects_v2(
+         Bucket=s3_bucket_name,
+         Prefix=s3_output_key_prefix
+     )
+
+     output_files = list_response.get('Contents', [])
+     if not output_files:
+         # Sometimes Textract might take a moment longer to write the output after SUCCEEDED status
+         #logging.warning("No output files found immediately after job success. Waiting briefly and retrying list...")
+         #time.sleep(5)
          list_response = s3_client.list_objects_v2(
              Bucket=s3_bucket_name,
              Prefix=s3_output_key_prefix
          )
          output_files = list_response.get('Contents', [])

+     if not output_files:
+         logging.error(f"No output files found in s3://{s3_bucket_name}/{s3_output_key_prefix}")
+         # You could alternatively try getting results via get_document_analysis pagination here
+         # but sticking to the request to download from S3 output path.
+         raise FileNotFoundError(f"Textract output files not found in S3 path: s3://{s3_bucket_name}/{s3_output_key_prefix}")
+
+     # Textract may split one job's output across several JSON parts; collect them all and merge their blocks below.
+     # Filter out potential directory markers and the service's access_check object.
+     json_files_to_download = [
+         f for f in output_files
+         if f['Key'] != s3_output_key_prefix and not f['Key'].endswith('/') and 'access_check' not in f['Key']
+     ]
+
+     #print("json_files_to_download:", json_files_to_download)
+
+     if not json_files_to_download:
+         error = f"No JSON files found (only prefix marker?) in s3://{s3_bucket_name}/{s3_output_key_prefix}"
+         print(error)
+         #logging.error(error)
+         raise FileNotFoundError(error)
+
+     combined_blocks = []
+
+     for f in sorted(json_files_to_download, key=lambda x: x['Key']): # Sort to keep page order consistent
+         obj = s3_client.get_object(Bucket=s3_bucket_name, Key=f['Key'])
+         data = json.loads(obj['Body'].read())
+
+         # Assuming Textract-style output with a "Blocks" key
+         if "Blocks" in data:
+             combined_blocks.extend(data["Blocks"])
+         else:
+             logging.warning(f"No 'Blocks' key in file: {f['Key']}")
+
+     # Build final combined JSON structure
+     combined_output = {
+         "DocumentMetadata": {
+             "Pages": len(set(block.get('Page', 1) for block in combined_blocks))
+         },
+         "Blocks": combined_blocks,
+         "JobStatus": "SUCCEEDED"
+     }
+
+     output_filename_base = os.path.basename(pdf_filename)
+     output_filename_base_no_ext = os.path.splitext(output_filename_base)[0]
+     local_output_filename = f"{output_filename_base_no_ext}_textract.json"
+     local_output_path = os.path.join(local_output_dir, local_output_filename)
+
+     with open(local_output_path, 'w') as f:
+         json.dump(combined_output, f)
+
+     print(f"Combined Textract output written to {local_output_path}")
+
+     # logging.info(f"Downloading Textract output from 's3://{s3_bucket_name}/{s3_output_key}' to '{local_output_path}'...")
+     # s3_client.download_file(s3_bucket_name, s3_output_key, local_output_path)
+     # logging.info("Download successful.")
+     downloaded_file_path = local_output_path
+
+     # Log if multiple files were found, as user might need to handle them
+     #if len(json_files_to_download) > 1:
+     #    logging.warning(f"Multiple output files found in S3 output location. Downloaded the first: '{s3_output_key}'. Other files exist.")

      return downloaded_file_path
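[Note: because the per-page JSON parts are merged into one file, downstream checks like the removed __main__ example still work on the combined output. A sketch, with the file name as produced above ("<pdf stem>_textract.json"):]

    import json

    with open("example_textract.json", "r") as f:
        results = json.load(f)

    print(f"Detected {results['DocumentMetadata']['Pages']} page(s).")
    signature_blocks = [b for b in results["Blocks"] if b.get("BlockType") == "SIGNATURE"]
    print(f"Found {len(signature_blocks)} potential signature block(s).")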
+ def check_for_provided_job_id(job_id: str):
+     if not job_id:
+         raise Exception("Please provide a job ID.")
+     return

+ def poll_bulk_textract_analysis_progress_and_download(
+     job_id: str,
+     job_type_dropdown: str,
+     s3_output_prefix: str,
+     pdf_filename: str,
+     job_df: pd.DataFrame,
+     s3_bucket_name: str = TEXTRACT_BULK_ANALYSIS_BUCKET,
+     local_output_dir: str = OUTPUT_FOLDER,
+     load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
+     load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
+     aws_region: str = AWS_REGION, # Optional: specify region if not default
+     poll_interval_seconds: int = 1,
+     max_polling_attempts: int = 1 # One check per call; the caller is expected to poll again if still in progress
+ ):

+     if job_id:
+         # Initialize boto3 clients
+         session = boto3.Session(region_name=aws_region)
+         s3_client = session.client('s3')
+         textract_client = session.client('textract')

+         # --- 3. Poll for Job Completion ---
+         job_status = 'IN_PROGRESS'
+         attempts = 0

+         message = "Polling Textract for job completion status..."
+         print(message)
+         #logging.info("Polling Textract for job completion status...")

+         # Update Textract document history df
+         try:
+             job_df = load_in_textract_job_details(load_s3_jobs=LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+                                                   load_s3_jobs_loc=load_s3_jobs_loc,
+                                                   load_local_jobs_loc=load_local_jobs_loc)
+         except Exception as e:
+             #logging.error(f"Failed to update job details dataframe: {e}")
+             print(f"Failed to update job details dataframe: {e}")
+             #raise
+
+         while job_status == 'IN_PROGRESS' and attempts < max_polling_attempts:
+             attempts += 1
+             try:
+                 if job_type_dropdown == "document_analysis":
+                     response = textract_client.get_document_analysis(JobId=job_id)
+                     job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
+                 elif job_type_dropdown == "document_text_detection":
+                     response = textract_client.get_document_text_detection(JobId=job_id)
+                     job_status = return_job_status(job_id, response, attempts, poll_interval_seconds, max_polling_attempts)
+                 else:
+                     error = "Unknown job type, cannot poll job"
+                     print(error)
+                     #logging.error(error)
+                     raise Exception(error) # a bare raise here would fail outside an active exception
+
+             except textract_client.exceptions.InvalidJobIdException:
+                 error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than 7 days) or never existed."
+                 print(error_message)
+                 logging.error(error_message)
+                 raise
+             except Exception as e:
+                 error_message = f"Error while polling Textract status for job {job_id}: {e}"
+                 print(error_message)
+                 logging.error(error_message)
+                 raise
+
+         downloaded_file_path = None
+         if job_status == 'SUCCEEDED':
+             #raise TimeoutError(f"Textract job {job_id} did not complete successfully within the polling limit.")
+             # 3b - Replace the provided PDF file name with the one recorded for this job, if present
+
+             # If job_df is not empty
+             if not job_df.empty:
+                 if "file_name" in job_df.columns:
+                     matching_job_id_file_names = job_df.loc[job_df["job_id"] == job_id, "file_name"]
+
+                     if pdf_filename and not matching_job_id_file_names.empty:
+                         if pdf_filename == matching_job_id_file_names.iloc[0]:
+                             raise Exception("Existing Textract outputs found. No need to re-download.")
+
+                     if not matching_job_id_file_names.empty:
+                         pdf_filename = matching_job_id_file_names.iloc[0]
+                     else:
+                         pdf_filename = "unknown_file"
+
+             # --- 4. Download Output JSON from S3 ---
+             # Textract typically creates output under s3_output_prefix/job_id/
+             # There might be multiple JSON files if pagination occurred during writing.
+             # Usually, for smaller docs, there's one file, often named '1'.
+             # For robust handling, list objects and find the JSON(s).
+
+             s3_output_key_prefix = os.path.join(s3_output_prefix, job_id).replace("\\", "/") + "/"
+             logging.info(f"Searching for output files in s3://{s3_bucket_name}/{s3_output_key_prefix}")
+
+             try:
+                 downloaded_file_path = download_textract_job_files(s3_client,
+                                                                    s3_bucket_name,
+                                                                    s3_output_key_prefix,
+                                                                    pdf_filename,
+                                                                    job_id,
+                                                                    local_output_dir)
+
+             except Exception as e:
+                 #logging.error(f"Failed to download or process Textract output from S3: {e}")
+                 print(f"Failed to download or process Textract output from S3: {e}")
+                 raise
+
+     else:
+         raise Exception("No Job ID provided.")
+
+     return downloaded_file_path, job_status, job_df
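[Note: a usage sketch for the polling function, with hypothetical values; job_id and job_type come from analyse_document_with_textract_api. With the default max_polling_attempts of 1, an unfinished job returns (None, 'IN_PROGRESS', job_df) rather than blocking, so a UI can simply call this again later.]

    import pandas as pd

    path, status, job_df = poll_bulk_textract_analysis_progress_and_download(
        job_id="0123456789abcdef",                   # hypothetical job ID
        job_type_dropdown="document_text_detection",
        s3_output_prefix="my-session/output",        # hypothetical prefix
        pdf_filename="example.pdf",
        job_df=pd.DataFrame(),
    )
    if status == "SUCCEEDED":
        print(f"Combined output saved to {path}")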
+

+ def load_in_textract_job_details(load_s3_jobs: str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
+                                  load_s3_jobs_loc: str = TEXTRACT_JOBS_S3_LOC,
+                                  load_local_jobs_loc: str = TEXTRACT_JOBS_LOCAL_LOC,
+                                  document_redaction_bucket: str = DOCUMENT_REDACTION_BUCKET,
+                                  aws_region: str = AWS_REGION):
+
+     job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])

+     # Initialize boto3 clients
+     session = boto3.Session(region_name=aws_region)
+     s3_client = session.client('s3')

+     local_output_path = f'{load_local_jobs_loc}/textract_job_log_files.csv'

+     if load_s3_jobs == 'True':
+
+         s3_output_key = f'{load_s3_jobs_loc}/textract_job_log_files.csv'
+
+         try:
+             s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
+             print(f"File exists. Downloading from '{s3_output_key}' to '{local_output_path}'...")
+             s3_client.download_file(document_redaction_bucket, s3_output_key, local_output_path)
+             print("Download successful.")
+         except ClientError as e:
+             if e.response['Error']['Code'] == '404':
+                 print("Log file does not exist in S3.")
+             else:
+                 print(f"Unexpected error occurred: {e}")
+         except (NoCredentialsError, PartialCredentialsError, TokenRetrievalError) as e:
+             print(f"AWS credential issue encountered: {e}")
+             print("Skipping S3 log file download.")
+
+     # If the log path exists, load it in
+     if os.path.exists(local_output_path):
+         print("Found log file in local path")
+         job_df = pd.read_csv(local_output_path)
+
+         if "job_date_time" in job_df.columns:
+             job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
+             # Keep only jobs submitted in the last 7 days (Textract results expire after that)
+             cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=7)
+             job_df = job_df.loc[job_df["job_date_time"] >= cutoff_time, :]
+
+     return job_df
+
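[Note: the frame returned here mirrors the CSV appended by analyse_document_with_textract_api, so the columns are stable even when no log exists yet. A quick local-only check, skipping the S3 download:]

    job_df = load_in_textract_job_details(load_s3_jobs='False')
    print(job_df.columns.tolist())
    # ['job_id', 'file_name', 'job_type', 'signature_extraction', 's3_location', 'job_date_time']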
+ def download_textract_output(job_id: str,
+                              output_bucket: str,
+                              output_prefix: str,
+                              local_folder: str):
      """
      Checks the status of a Textract job and downloads the output ZIP file if the job is complete.

          print("Job failed:", response.get("StatusMessage", "No error message provided."))
          return
      else:
+         print(f"Job is still {status}.")
+         #time.sleep(10) # Wait before checking again

      # Find output ZIP file in S3
      output_file_key = f"{output_prefix}/{job_id}.zip"

          print(f"Output file downloaded to: {local_file_path}")
      except Exception as e:
          print(f"Error downloading file: {e}")
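[Note: the example-usage comment was removed from the end of the module in this diff; for reference, a call to the remaining helper looks like this, with all values hypothetical:]

    download_textract_output("your-job-id", "your-output-bucket", "your-output-prefix", "/path/to/local/folder")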