seanpedrickcase committed
Commit cb349ad · Parent: 3518b67

Ensured that text OCR outputs have no trailing line breaks. Multi-line custom text searches are now possible. Files for review are now sent from the redact button. Fixed image redaction (the review step is not yet fixed). User pool details can now be read from request headers. Updated Gradio.

.dockerignore CHANGED
@@ -16,4 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-doc_redaction_amplify_app/*
+doc_redaction_amplify_app/*
+user_guide/*
.gitignore CHANGED
@@ -16,4 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-doc_redaction_amplify_app/*
+doc_redaction_amplify_app/*
+user_guide/*
app.py CHANGED
@@ -66,26 +66,27 @@ with app:
 
 pdf_doc_state = gr.State([])
 all_image_annotations_state = gr.State([])
-all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
-all_decision_process_table_state = gr.State(pd.DataFrame())
+
+
+all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
+all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
+review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()
 
 first_loop_state = gr.State(True)
 second_loop_state = gr.State(False)
+do_not_save_pdf_state = gr.State(False)
 
-prepared_pdf_state = gr.State([])
-images_pdf_state = gr.State([]) # List of pdf pages converted to PIL images
+prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
+
 
-output_image_files_state = gr.State([])
-output_file_list_state = gr.State([])
-text_output_file_list_state = gr.State([])
-log_files_output_list_state = gr.State([])
-
-review_file_state = gr.State(pd.DataFrame())
-
-do_not_save_pdf_state = gr.State(False)
+output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False) #gr.State([])
+log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False) #gr.State([])
 
+
 # Logging state
 log_file_name = 'log.csv'
@@ -95,7 +96,7 @@ with app:
 access_logs_state = gr.State(access_logs_folder + log_file_name)
 access_s3_logs_loc_state = gr.State(access_logs_folder)
 usage_logs_state = gr.State(usage_logs_folder + log_file_name)
-usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+usage_s3_logs_loc_state = gr.State(usage_logs_folder)
 
 # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
 session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -115,8 +116,7 @@ with app:
 estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
 annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
-s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-
+s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
 ## Annotator zoom value
 annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
@@ -129,16 +129,16 @@ with app:
 ## Settings page variables
 default_allow_list_file_name = "default_allow_list.csv"
 default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
-in_allow_list_state = gr.State(pd.DataFrame())
+in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
 
 default_deny_list_file_name = "default_deny_list.csv"
 default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-in_deny_list_state = gr.State([])
+in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
 in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
 
 fully_redacted_list_file_name = "default_fully_redacted_list.csv"
 fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-in_fully_redacted_list_state = gr.State([])
+in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
 in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
 
 # S3 settings for default allow list load
@@ -209,6 +209,8 @@ with app:
 with gr.Row():
     annotate_zoom_in = gr.Button("Zoom in")
     annotate_zoom_out = gr.Button("Zoom out")
+with gr.Row():
+    annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
 with gr.Row():
     clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
 
@@ -237,18 +239,16 @@ with app:
 )
 
 with gr.Row():
-    annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+    annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
+    annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
+    annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
+    annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
 #with gr.Column(scale=1):
 with gr.Row():
     recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
     recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
-
-with gr.Row():
-    annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
-    annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
-    annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
-    annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
+
 
 # TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
@@ -322,12 +322,12 @@ with app:
 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
 then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If the app has completed a batch of pages, it will run this until the end of all pages in the document
 current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If a file has been completed, the function will continue onto the next document
@@ -394,7 +394,8 @@ with app:
 
 recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
 then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
 
 ###
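Note: the recurring change in app.py swaps gr.State holders for invisible gr.Dataframe / gr.Dropdown components, keeping the old constructors in trailing comments. A plausible motivation is that real components, unlike gr.State, can be wired into API-named events such as api_name="redact_doc" and restored alongside other outputs. A minimal sketch of the pattern, with illustrative names that are not from this commit:

import gradio as gr
import pandas as pd

def fill_results() -> pd.DataFrame:
    # The returned value lands in the hidden component instead of a gr.State
    return pd.DataFrame({"page": [1], "label": ["NAME"]})

with gr.Blocks() as demo:
    # Hidden holder: behaves like session state, but is a real component,
    # so it can be an output of an API-named event
    results_df = gr.Dataframe(value=pd.DataFrame(), visible=False, type="pandas")
    run_btn = gr.Button("Run")
    run_btn.click(fn=fill_results, outputs=[results_df], api_name="run")

if __name__ == "__main__":
    demo.launch()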
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==2.2.3
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.10.0
+gradio==5.12.0
 boto3==1.35.83
 pyarrow==18.1.0
 openpyxl==3.1.2
tools/auth.py CHANGED
@@ -1,10 +1,21 @@
 
+import os
 import boto3
 import gradio as gr
 import hmac
 import hashlib
 import base64
-from tools.helper_functions import get_or_create_env_var
+
+def get_or_create_env_var(var_name, default_value):
+    # Get the environment variable if it exists
+    value = os.environ.get(var_name)
+
+    # If it doesn't exist, set it to the default value
+    if value is None:
+        os.environ[var_name] = default_value
+        value = default_value
+
+    return value
 
 client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
 #print(f'The value of AWS_CLIENT_ID is {client_id}')
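Note: auth.py now defines get_or_create_env_var locally rather than importing it from tools.helper_functions, perhaps to keep the auth module free of app-level imports; the commit does not say. The helper reads an environment variable and writes the default back into os.environ when it is unset, so every later reader sees the same value. A short usage sketch, with an assumed variable name:

import os

def get_or_create_env_var(var_name, default_value):
    # Get the environment variable if it exists
    value = os.environ.get(var_name)
    # If it doesn't exist, set it to the default so later readers agree
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# Illustrative only: AWS_USER_POOL_ID is an assumed name, not shown in this commit
user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
print(f'AWS_USER_POOL_ID set: {bool(user_pool_id)}')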
tools/custom_image_analyser_engine.py CHANGED
@@ -1,20 +1,19 @@
 import pytesseract
 import numpy as np
 from presidio_analyzer import AnalyzerEngine, RecognizerResult
-#from presidio_image_redactor import ImagePreprocessor
 from typing import List, Dict, Optional, Union, Tuple
 from dataclasses import dataclass
 import time
 import cv2
+import copy
+from copy import deepcopy
+from pdfminer.layout import LTChar
 import PIL
-from PIL import ImageDraw, ImageFont, Image
+from PIL import Image
 from typing import Optional, Tuple, Union
-from copy import deepcopy
 from tools.helper_functions import clean_unicode_text
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
-#import string # Import string to get a list of common punctuation characters
-import re # Add this import at the top of the file
 
 @dataclass
 class OCRResult:
@@ -174,7 +173,6 @@ class BilateralFilter(ImagePreprocessor):
 
         return Image.fromarray(filtered_image), metadata
 
-
 class SegmentedAdaptiveThreshold(ImagePreprocessor):
     """SegmentedAdaptiveThreshold class.
 
@@ -252,9 +250,6 @@ class SegmentedAdaptiveThreshold(ImagePreprocessor):
         metadata = {"C": c, "background_color": background_color, "contrast": contrast}
         return Image.fromarray(adaptive_threshold_image), metadata
 
-
-
-
 class ImageRescaling(ImagePreprocessor):
     """ImageRescaling class. Rescales images based on their size."""
 
@@ -302,7 +297,6 @@ class ImageRescaling(ImagePreprocessor):
         metadata = {"scale_factor": scale_factor}
         return Image.fromarray(rescaled_image), metadata
 
-
 class ContrastSegmentedImageEnhancer(ImagePreprocessor):
     """Class containing all logic to perform contrastive segmentation.
 
@@ -409,6 +403,464 @@ def bounding_boxes_overlap(box1, box2):
     """Check if two bounding boxes overlap."""
     return (box1[0] < box2[2] and box2[0] < box1[2] and
             box1[1] < box2[3] and box2[1] < box1[3])
+
+def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
+    for entity in page_analyser_result:
+        entity_start = entity.start
+        entity_end = entity.end
+
+        # Track if the entity has been added to any line
+        added_to_line = False
+
+        for batch_start, line_idx, original_line, chars in page_text_mapping:
+            batch_end = batch_start + len(original_line.text)
+
+            # Check if the entity overlaps with the current line
+            if batch_start < entity_end and batch_end > entity_start: # Overlap condition
+                relative_start = max(0, entity_start - batch_start) # Adjust start relative to the line
+                relative_end = min(entity_end - batch_start, len(original_line.text)) # Adjust end relative to the line
+
+                # Create a new adjusted entity
+                adjusted_entity = copy.deepcopy(entity)
+                adjusted_entity.start = relative_start
+                adjusted_entity.end = relative_end
+
+                # Check if this line already has an entry
+                existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
+
+                if existing_entry is None:
+                    all_text_line_results.append((line_idx, [adjusted_entity]))
+                else:
+                    existing_entry.append(adjusted_entity) # Append to the existing list of entities
+
+                added_to_line = True
+
+        # If the entity spans multiple lines, you may want to handle that here
+        if not added_to_line:
+            # Handle cases where the entity does not fit in any line (optional)
+            print(f"Entity '{entity}' does not fit in any line.")
+
+    return all_text_line_results
+
+def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+    if not response or "Entities" not in response:
+        return all_text_line_results
+
+    for entity in response["Entities"]:
+        if entity.get("Type") not in chosen_redact_comprehend_entities:
+            continue
+
+        entity_start = entity["BeginOffset"]
+        entity_end = entity["EndOffset"]
+
+        # Track if the entity has been added to any line
+        added_to_line = False
+
+        # Find the correct line and offset within that line
+        for batch_start, line_idx, original_line, chars, line_offset in current_batch_mapping:
+            batch_end = batch_start + len(original_line.text[line_offset:])
+
+            # Check if the entity overlaps with the current line
+            if batch_start < entity_end and batch_end > entity_start: # Overlap condition
+                # Calculate the absolute position within the line
+                relative_start = max(0, entity_start - batch_start + line_offset)
+                relative_end = min(entity_end - batch_start + line_offset, len(original_line.text))
+
+                result_text = original_line.text[relative_start:relative_end]
+
+                if result_text not in allow_list:
+                    adjusted_entity = entity.copy()
+                    adjusted_entity["BeginOffset"] = relative_start # Now relative to the full line
+                    adjusted_entity["EndOffset"] = relative_end
+
+                    recogniser_entity = recognizer_result_from_dict(adjusted_entity)
+
+                    existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
+                    if existing_entry is None:
+                        all_text_line_results.append((line_idx, [recogniser_entity]))
+                    else:
+                        existing_entry.append(recogniser_entity) # Append to the existing list of entities
+
+                    added_to_line = True
+
+        # Optional: Handle cases where the entity does not fit in any line
+        if not added_to_line:
+            print(f"Entity '{entity}' does not fit in any line.")
+
+    return all_text_line_results
+
+def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
+    if not current_batch:
+        return all_text_line_results
+
+    max_retries = 3
+    retry_delay = 3
+
+    for attempt in range(max_retries):
+        try:
+            response = comprehend_client.detect_pii_entities(
+                Text=current_batch.strip(),
+                LanguageCode=language
+            )
+
+            all_text_line_results = map_back_comprehend_entity_results(
+                response,
+                current_batch_mapping,
+                allow_list,
+                chosen_redact_comprehend_entities,
+                all_text_line_results
+            )
+
+            return all_text_line_results
+
+        except Exception as e:
+            if attempt == max_retries - 1:
+                raise
+            time.sleep(retry_delay)
+
+def run_page_text_redaction(
+    language: str,
+    chosen_redact_entities: List[str],
+    chosen_redact_comprehend_entities: List[str],
+    line_level_text_results_list: List[str],
+    line_characters: List,
+    page_analyser_results: List = [],
+    page_analysed_bounding_boxes: List = [],
+    comprehend_client = None,
+    allow_list: List[str] = None,
+    pii_identification_method: str = "Local",
+    nlp_analyser = None,
+    score_threshold: float = 0.0,
+    custom_entities: List[str] = None,
+    comprehend_query_number:int = 0#,
+    #merge_text_bounding_boxes_fn = merge_text_bounding_boxes
+):
+    #if not merge_text_bounding_boxes_fn:
+    #    raise ValueError("merge_text_bounding_boxes_fn is required")
+
+    page_text = ""
+    page_text_mapping = []
+    all_text_line_results = []
+    comprehend_query_number = 0
+
+    # Collect all text from the page
+    for i, text_line in enumerate(line_level_text_results_list):
+        #print("line_level_text_results_list:", line_level_text_results_list)
+        if chosen_redact_entities:
+            if page_text:
+                #page_text += " | "
+                page_text += " "
+
+            start_pos = len(page_text)
+            page_text += text_line.text
+            page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
+
+    # Process based on identification method
+    if pii_identification_method == "Local":
+        if not nlp_analyser:
+            raise ValueError("nlp_analyser is required for Local identification method")
+
+        print("page text:", page_text)
+
+        page_analyser_result = nlp_analyser.analyze(
+            text=page_text,
+            language=language,
+            entities=chosen_redact_entities,
+            score_threshold=score_threshold,
+            return_decision_process=True,
+            allow_list=allow_list
+        )
+
+        #print("page_analyser_result:", page_analyser_result)
+
+        all_text_line_results = map_back_entity_results(
+            page_analyser_result,
+            page_text_mapping,
+            all_text_line_results
+        )
+
+        #print("all_text_line_results:", all_text_line_results)
+
+    elif pii_identification_method == "AWS Comprehend":
+        #print("page text:", page_text)
+
+        # Process custom entities if any
+        if custom_entities:
+            custom_redact_entities = [
+                entity for entity in chosen_redact_comprehend_entities
+                if entity in custom_entities
+            ]
+            if custom_redact_entities:
+                page_analyser_result = nlp_analyser.analyze(
+                    text=page_text,
+                    language=language,
+                    entities=custom_redact_entities,
+                    score_threshold=score_threshold,
+                    return_decision_process=True,
+                    allow_list=allow_list
+                )
+
+                print("page_analyser_result:", page_analyser_result)
+
+                all_text_line_results = map_back_entity_results(
+                    page_analyser_result,
+                    page_text_mapping,
+                    all_text_line_results
+                )
+
+        current_batch = ""
+        current_batch_mapping = []
+        batch_char_count = 0
+        batch_word_count = 0
+
+        for i, text_line in enumerate(line_level_text_results_list):
+            words = text_line.text.split()
+            word_start_positions = []
+
+            # Calculate word start positions within the line
+            current_pos = 0
+            for word in words:
+                word_start_positions.append(current_pos)
+                current_pos += len(word) + 1 # +1 for space
+
+            for word_idx, word in enumerate(words):
+                new_batch_char_count = len(current_batch) + len(word) + 1
+
+                if batch_word_count >= 50 or new_batch_char_count >= 200:
+                    # Process current batch
+                    all_text_line_results = do_aws_comprehend_call(
+                        current_batch,
+                        current_batch_mapping,
+                        comprehend_client,
+                        language,
+                        allow_list,
+                        chosen_redact_comprehend_entities,
+                        all_text_line_results
+                    )
+                    comprehend_query_number += 1
+
+                    # Start new batch
+                    current_batch = word
+                    batch_word_count = 1
+                    batch_char_count = len(word)
+                    current_batch_mapping = [(0, i, text_line, line_characters[i], word_start_positions[word_idx])]
+                else:
+                    if current_batch:
+                        current_batch += " "
+                        batch_char_count += 1
+                    current_batch += word
+                    batch_char_count += len(word)
+                    batch_word_count += 1
+
+                    if not current_batch_mapping or current_batch_mapping[-1][1] != i:
+                        current_batch_mapping.append((
+                            batch_char_count - len(word),
+                            i,
+                            text_line,
+                            line_characters[i],
+                            word_start_positions[word_idx] # Add the word's start position within its line
+                        ))
+
+        # Process final batch
+        if current_batch:
+            all_text_line_results = do_aws_comprehend_call(
+                current_batch,
+                current_batch_mapping,
+                comprehend_client,
+                language,
+                allow_list,
+                chosen_redact_comprehend_entities,
+                all_text_line_results
+            )
+            comprehend_query_number += 1
+
+    # Process results for each line
+    for i, text_line in enumerate(line_level_text_results_list):
+        line_results = next((results for idx, results in all_text_line_results if idx == i), [])
+
+        if line_results:
+            text_line_bounding_boxes = merge_text_bounding_boxes(
+                line_results,
+                line_characters[i]
+            )
+
+            page_analyser_results.extend(line_results)
+            page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
+
+    return page_analysed_bounding_boxes
+
+def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
+    '''
+    Merge identified bounding boxes containing PII that are very close to one another
+    '''
+    analysed_bounding_boxes = []
+    original_bounding_boxes = [] # List to hold original bounding boxes
+
+    if len(analyser_results) > 0 and len(characters) > 0:
+        # Extract bounding box coordinates for sorting
+        bounding_boxes = []
+        for result in analyser_results:
+            #print("Result:", result)
+            char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
+            char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
+            if char_boxes:
+                # Calculate the bounding box that encompasses all characters
+                left = min(box[0] for box in char_boxes)
+                bottom = min(box[1] for box in char_boxes)
+                right = max(box[2] for box in char_boxes)
+                top = max(box[3] for box in char_boxes) + vertical_padding
+                bbox = [left, bottom, right, top]
+                bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
+
+                # Store original bounding boxes
+                original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
+                #print("Original bounding boxes:", original_bounding_boxes)
+
+        # Sort the results by y-coordinate and then by x-coordinate
+        bounding_boxes.sort()
+
+        merged_bounding_boxes = []
+        current_box = None
+        current_y = None
+        current_result = None
+        current_text = []
+
+        for y, x, result, next_box, text in bounding_boxes:
+            if current_y is None or current_box is None:
+                # Initialize the first bounding box
+                current_box = next_box
+                current_y = next_box[1]
+                current_result = result
+                current_text = list(text)
+            else:
+                vertical_diff_bboxes = abs(next_box[1] - current_y)
+                horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
+
+                if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
+                    # Merge bounding boxes
+                    #print("Merging boxes")
+                    merged_box = current_box.copy()
+                    merged_result = current_result
+                    merged_text = current_text.copy()
+
+                    merged_box[2] = next_box[2] # Extend horizontally
+                    merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
+                    merged_result.end = max(current_result.end, result.end) # Extend text range
+                    try:
+                        if current_result.entity_type != result.entity_type:
+                            merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+                        else:
+                            merged_result.entity_type = current_result.entity_type
+                    except Exception as e:
+                        print("Unable to combine result entity types:", e)
+                    if current_text:
+                        merged_text.append(" ") # Add space between texts
+                    merged_text.extend(text)
+
+                    merged_bounding_boxes.append({
+                        "text": "".join(merged_text),
+                        "boundingBox": merged_box,
+                        "result": merged_result
+                    })
+
+                else:
+                    # Start a new bounding box
+                    current_box = next_box
+                    current_y = next_box[1]
+                    current_result = result
+                    current_text = list(text)
+
+        # Combine original and merged bounding boxes
+        analysed_bounding_boxes.extend(original_bounding_boxes)
+        analysed_bounding_boxes.extend(merged_bounding_boxes)
+
+        #print("Analysed bounding boxes:", analysed_bounding_boxes)
+
+    return analysed_bounding_boxes
+
+# Function to combine OCR results into line-level results
+def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
+    # Group OCR results into lines based on y_threshold
+    lines = []
+    current_line = []
+    for result in sorted(ocr_results, key=lambda x: x.top):
+        if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
+            current_line.append(result)
+        else:
+            lines.append(current_line)
+            current_line = [result]
+    if current_line:
+        lines.append(current_line)
+
+    # Sort each line by left position
+    for line in lines:
+        line.sort(key=lambda x: x.left)
+
+    # Flatten the sorted lines back into a single list
+    sorted_results = [result for line in lines for result in line]
+
+    combined_results = []
+    new_format_results = {}
+    current_line = []
+    current_bbox = None
+    line_counter = 1
+
+    def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
+        combined_results["text_line_" + str(i)] = {
+            "line": i,
+            'text': current_bbox.text,
+            'bounding_box': (current_bbox.left, current_bbox.top,
+                             current_bbox.left + current_bbox.width,
+                             current_bbox.top + current_bbox.height),
+            'words': [{'text': word.text,
+                       'bounding_box': (word.left, word.top,
+                                        word.left + word.width,
+                                        word.top + word.height)}
+                      for word in current_line]
+        }
+        return combined_results["text_line_" + str(i)]
+
+    for result in sorted_results:
+        if not current_line:
+            # Start a new line
+            current_line.append(result)
+            current_bbox = result
+        else:
+            # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
+            last_result = current_line[-1]
+
+            if abs(result.top - last_result.top) <= y_threshold and \
+               (result.left - (last_result.left + last_result.width)) <= x_threshold:
+                # Update the bounding box to include the new word
+                new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
+                current_bbox = OCRResult(
+                    text=f"{current_bbox.text} {result.text}",
+                    left=current_bbox.left,
+                    top=current_bbox.top,
+                    width=new_right - current_bbox.left,
+                    height=max(current_bbox.height, result.height)
+                )
+                current_line.append(result)
+            else:
+
+
+                # Commit the current line and start a new one
+                combined_results.append(current_bbox)
+
+                new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
+
+                line_counter += 1
+                current_line = [result]
+                current_bbox = result
+
+    # Append the last line
+    if current_bbox:
+        combined_results.append(current_bbox)
+
+        new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
+
+    return combined_results, new_format_results
 
 class CustomImageAnalyzerEngine:
     def __init__(
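Note: run_page_text_redaction above now analyses whole-page text in one pass and relies on map_back_entity_results to translate page-level entity spans into per-line spans. The arithmetic is easy to get wrong, so here is a self-contained sketch of it under the same single-space-join assumption (function and variable names are illustrative, not the app's API):

from typing import Iterator, List, Tuple

def join_lines(lines: List[str]) -> Tuple[str, List[int]]:
    # Join OCR lines with single spaces, recording each line's start offset
    page_text = ""
    starts: List[int] = []
    for line in lines:
        if page_text:
            page_text += " "  # same separator the commit uses
        starts.append(len(page_text))
        page_text += line
    return page_text, starts

def map_span_to_lines(start: int, end: int, lines: List[str],
                      starts: List[int]) -> Iterator[Tuple[int, int, int]]:
    # Yield (line_index, relative_start, relative_end) for each overlapped line
    for i, (line, line_start) in enumerate(zip(lines, starts)):
        line_end = line_start + len(line)
        if line_start < end and line_end > start:  # the commit's overlap test
            yield i, max(0, start - line_start), min(end - line_start, len(line))

lines = ["Call John Smith", "on Monday"]
page_text, starts = join_lines(lines)
start = page_text.find("John")
print(list(map_span_to_lines(start, start + len("John Smith"), lines, starts)))
# -> [(0, 5, 15)]

Because the overlap test fires once per overlapped line, an entity that crosses a line boundary is reported against every line it touches, each with line-relative offsets.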
@@ -463,261 +915,225 @@ class CustomImageAnalyzerEngine:
         self,
         line_level_ocr_results: List[OCRResult],
         ocr_results_with_children: Dict[str, Dict],
-        chosen_redact_comprehend_entities: List[str],
-        pii_identification_method: str = "Local",
-        comprehend_client="",
         **text_analyzer_kwargs
     ) -> List[CustomImageRecognizerResult]:
-        # Define English as default language, if not specified
-        if "language" not in text_analyzer_kwargs:
-            text_analyzer_kwargs["language"] = "en"
 
-        horizontal_buffer = 0 # add pixels to right of width
-        height_buffer = 2 # add pixels to bounding box height
         comprehend_query_number = 0
-
-        allow_list = text_analyzer_kwargs.get('allow_list', [])
-
-        combined_results = []
-        # Initialize variables for batching
-        current_batch = ""
-        current_batch_mapping = [] # List of (start_pos, line_index, original_text) tuples
-        analyzer_results_by_line = [[] for _ in line_level_ocr_results] # Store results for each line
 
-        # Process OCR results in batches
         for i, line_level_ocr_result in enumerate(line_level_ocr_results):
-            if pii_identification_method == "Local":
-                analyzer_result = self.analyzer_engine.analyze(
-                    text=line_level_ocr_result.text, **text_analyzer_kwargs
-                )
-                analyzer_results_by_line[i] = analyzer_result
 
-            elif pii_identification_method == "AWS Comprehend":
 
-                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
-                text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
 
-                spacy_analyzer_result = self.analyzer_engine.analyze(
-                    text=line_level_ocr_result.text, **text_analyzer_kwargs)
 
-                analyzer_results_by_line[i].extend(spacy_analyzer_result)
 
-                if len(line_level_ocr_result.text) >= 3:
-                    # Add line to current batch with a separator
-                    if current_batch:
-                        current_batch += " | " # Use a separator that's unlikely to appear in the text
 
-                    start_pos = len(current_batch)
-                    current_batch += line_level_ocr_result.text
-                    current_batch_mapping.append((start_pos, i, line_level_ocr_result.text))
-
-                    # Process batch if it's approaching 300 characters or this is the last line
-                    if len(current_batch) >= 200 or i == len(line_level_ocr_results) - 1:
-                        print("length of text for Comprehend:", len(current_batch))
-
-                        try:
-                            response = comprehend_client.detect_pii_entities(
-                                Text=current_batch,
-                                LanguageCode=text_analyzer_kwargs["language"]
-                            )
-
-                        except Exception as e:
-                            print("AWS Comprehend call failed due to:", e, "waiting three seconds to try again.")
-                            time.sleep(3)
-                            response = comprehend_client.detect_pii_entities(
-                                Text=current_batch,
-                                LanguageCode=text_analyzer_kwargs["language"]
-                            )
-
                         comprehend_query_number += 1
-
-                        # Map results back to original lines
-                        if response and "Entities" in response:
-                            for entity in response["Entities"]:
-                                entity_start = entity["BeginOffset"]
-                                entity_end = entity["EndOffset"]
-
-                                # Find which line this entity belongs to
-                                for batch_start, line_idx, original_text in current_batch_mapping:
-                                    batch_end = batch_start + len(original_text)
-
-                                    # Check if entity belongs to this line
-                                    if batch_start <= entity_start < batch_end:
-                                        # Adjust offsets relative to the original line
-                                        relative_start = entity_start - batch_start
-                                        relative_end = min(entity_end - batch_start, len(original_text))
-
-                                        result_text = original_text[relative_start:relative_end]
-
-                                        if result_text not in allow_list:
-                                            if entity.get("Type") in chosen_redact_comprehend_entities:
-                                                # Create a new entity with adjusted positions
-                                                adjusted_entity = entity.copy()
-                                                adjusted_entity["BeginOffset"] = relative_start
-                                                adjusted_entity["EndOffset"] = relative_end
-
-                                                recogniser_entity = recognizer_result_from_dict(adjusted_entity)
-                                                analyzer_results_by_line[line_idx].append(recogniser_entity)
 
                         # Reset batch
-                        current_batch = ""
-                        current_batch_mapping = []
 
-        # Process results for each line
-        for i, analyzer_result in enumerate(analyzer_results_by_line):
-            if i >= len(ocr_results_with_children):
-                continue
 
            child_level_key = list(ocr_results_with_children.keys())[i]
            ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
-
-            # Go through results to add bounding boxes
-            for result in analyzer_result:
-                # Extract the relevant portion of text based on start and end
-                relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
-                # Find the corresponding entry in ocr_results_with_children
-                child_words = ocr_results_with_children_line_level['words']
-
-                # Initialize bounding box values
-                left, top, bottom = float('inf'), float('inf'), float('-inf')
-                all_words = ""
-                word_num = 0 # Initialize word count
-                total_width = 0 # Initialize total width
-
-                split_relevant_text = relevant_text.split()
-
-                loop_child_words = child_words.copy()
-
-                for word_text in split_relevant_text: # Iterate through each word in relevant_text
-
-                    quote_str = '"'
-                    replace_str = '(?:"|"|")'
-
-                    word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
-
-                    for word in loop_child_words:
-                        # Check for regex as whole word
-
-                        if re.search(word_regex, word['text']):
-                            #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
-                            found_word = word
-
-                            if word_num == 0: # First word
-                                left = found_word['bounding_box'][0]
-                                top = found_word['bounding_box'][1]
-                            bottom = max(bottom, found_word['bounding_box'][3]) # Update bottom for all words
-                            all_words += found_word['text'] + " " # Concatenate words
-                            total_width = found_word['bounding_box'][2] - left # Add each word's width
-                            word_num += 1
-
-                            # Drop the first word of child_words
-                            loop_child_words = loop_child_words[1:] # Skip the first word
-
-                            break # Move to the next word in relevant_text
-
-                width = total_width + horizontal_buffer # Set width to total width of all matched words
-                height = bottom - top if word_num > 0 else 0 # Calculate height
-
-                relevant_line_ocr_result = OCRResult(
-                    text=relevant_text,
-                    left=left,
-                    top=top - height_buffer,
-                    width=width,
-                    height=height + height_buffer
-                )
-
-                if not ocr_results_with_children_line_level:
-                    # Fallback to previous method if not found in ocr_results_with_children
-                    print("No child info found")
-                    continue
-
-                # Reset the word positions indicated in the relevant ocr_result - i.e. it starts from 0 and ends at word length
-                result_reset_pos = result
-                result_reset_pos.start = 0
-                result_reset_pos.end = len(relevant_text)
-
-                #print("result_reset_pos:", result_reset_pos)
-                #print("relevant_line_ocr_result:", relevant_line_ocr_result)
-                #print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
-
-                # Map the analyzer results to bounding boxes for this line
-                line_results = self.map_analyzer_results_to_bounding_boxes(
-                    [result_reset_pos], [relevant_line_ocr_result], relevant_line_ocr_result.text, allow_list, ocr_results_with_children_line_level
                )
-
-                #print("line_results:", line_results)
-
-                combined_results.extend(line_results)
 
        return combined_results, comprehend_query_number
 
    @staticmethod
    def map_analyzer_results_to_bounding_boxes(
-        text_analyzer_results: List[RecognizerResult],
-        redaction_relevant_ocr_results: List[OCRResult],
-        full_text: str,
-        allow_list: List[str],
-        ocr_results_with_children_child_info: Dict[str, Dict]
-    ) -> List[CustomImageRecognizerResult]:
        redaction_bboxes = []
-        text_position = 0
 
        for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
-            word_end = text_position + len(redaction_relevant_ocr_result.text)
 
-            #print("Checking relevant OCR result:", redaction_relevant_ocr_result)
 
            for redaction_result in text_analyzer_results:
-                max_of_current_text_pos_or_result_start_pos = max(text_position, redaction_result.start)
-                min_of_result_end_pos_or_results_end = min(word_end, redaction_result.end)
-
-                redaction_result_bounding_box = (redaction_relevant_ocr_result.left, redaction_relevant_ocr_result.top,
-                    redaction_relevant_ocr_result.left + redaction_relevant_ocr_result.width,
-                    redaction_relevant_ocr_result.top + redaction_relevant_ocr_result.height)
-
-                if (max_of_current_text_pos_or_result_start_pos < min_of_result_end_pos_or_results_end) and (redaction_relevant_ocr_result.text not in allow_list):
-                    #print("result", redaction_result, "made it through if statement")
-                    # Find the corresponding entry in ocr_results_with_children that overlap with the redaction result
-                    child_info = ocr_results_with_children_child_info#.get(full_text)
-
-                    #print("child_info in sub function:", child_info)
-                    #print("redaction_result_bounding_box:", redaction_result_bounding_box)
-                    #print("Overlaps?", bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']))
-
-                    if bounding_boxes_overlap(redaction_result_bounding_box, child_info['bounding_box']):
-                        # Use the bounding box from ocr_results_with_children
-                        bbox = redaction_result_bounding_box #child_info['bounding_box']
-                        left, top, right, bottom = bbox
-                        width = right - left
-                        height = bottom - top
-
-                    else:
-                        print("Could not find OCR result")
-                        continue
-
-                    redaction_bboxes.append(
-                        CustomImageRecognizerResult(
-                            entity_type=redaction_result.entity_type,
-                            start=redaction_result.start,
-                            end=redaction_result.end,
-                            score=redaction_result.score,
-                            left=left,
-                            top=top,
-                            width=width,
-                            height=height,
-                            text=redaction_relevant_ocr_result.text
                        )
-                    )
-
-            text_position = word_end + 1 # +1 for the space between words
 
        return redaction_bboxes
 
    @staticmethod
    def remove_space_boxes(ocr_result: dict) -> dict:
        """Remove OCR bboxes that are for spaces.
-
        :param ocr_result: OCR results (raw or thresholded).
        :return: OCR results with empty words removed.
        """
@@ -740,10 +1156,8 @@ class CustomImageAnalyzerEngine:
         ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
     ) -> Dict[str, float]:
         """Scale down the bounding box results based on a scale percentage.
-
         :param ocr_result: OCR results (raw).
         :param scale_percent: Scale percentage for resizing the bounding box.
-
         :return: OCR results (scaled).
         """
         scaled_results = deepcopy(ocr_result)
@@ -790,173 +1204,3 @@ class CustomImageAnalyzerEngine:
         estimated_width = int(proportion * ocr_result.width)
 
         return estimated_width
-
-
-    # def estimate_width(self, ocr_result: OCRResult, start: int, end: int) -> int:
-    #     # Extract the relevant text portion
-    #     relevant_text = ocr_result.text[start:end]
-
-    #     # Check if the relevant text is the entire text of the OCR result
-    #     if relevant_text == ocr_result.text:
-    #         return ocr_result.width
-
-    #     # Estimate the font size based on the height of the bounding box
-    #     estimated_font_size = ocr_result.height + 4
-
-    #     # Create a blank image with enough width to measure the text
-    #     dummy_image = Image.new('RGB', (1000, 50), color=(255, 255, 255))
-    #     draw = ImageDraw.Draw(dummy_image)
-
-    #     # Specify the font and size
-    #     try:
-    #         font = ImageFont.truetype("arial.ttf", estimated_font_size) # Adjust the font file as needed
-    #     except IOError:
-    #         font = ImageFont.load_default() # Fallback to default font if the specified font is not found
-
-    #     # Draw the relevant text on the image
-    #     draw.text((0, 0), relevant_text, fill=(0, 0, 0), font=font)
-
-    #     # Save the image for debugging purposes
-    #     dummy_image.save("debug_image.png")
-
-    #     # Use pytesseract to get the bounding box of the relevant text
-    #     bbox = pytesseract.image_to_boxes(dummy_image, config=self.tesseract_config)
-
-    #     # Print the bbox for debugging
-    #     print("Bounding box:", bbox)
-
-    #     # Calculate the width from the bounding box
-    #     if bbox:
-    #         try:
-    #             # Initialize min_left and max_right with extreme values
-    #             min_left = float('inf')
-    #             max_right = float('-inf')
-
-    #             # Split the bbox string into lines
-    #             bbox_lines = bbox.splitlines()
-
-    #             for line in bbox_lines:
-    #                 parts = line.split()
-    #                 if len(parts) == 6:
-    #                     _, left, _, right, _, _ = parts
-    #                     left = int(left)
-    #                     right = int(right)
-    #                     min_left = min(min_left, left)
-    #                     max_right = max(max_right, right)
-
-    #             width = max_right - min_left
-    #         except ValueError as e:
-    #             print("Error parsing bounding box:", e)
-    #             width = 0
-    #     else:
-    #         width = 0
-
-    #     print("Estimated width:", width)
-
-    #     return width
-
-
-
-# Function to combine OCR results into line-level results
-def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
-    # Group OCR results into lines based on y_threshold
-    lines = []
-    current_line = []
-    for result in sorted(ocr_results, key=lambda x: x.top):
-        if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
-            current_line.append(result)
-        else:
-            lines.append(current_line)
-            current_line = [result]
-    if current_line:
-        lines.append(current_line)
-
-    # Sort each line by left position
-    for line in lines:
-        line.sort(key=lambda x: x.left)
-
-    # Flatten the sorted lines back into a single list
-    sorted_results = [result for line in lines for result in line]
-
-    combined_results = []
-    new_format_results = {}
-    current_line = []
-    current_bbox = None
-    line_counter = 1
-
-    def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
-        combined_results["text_line_" + str(i)] = {
-            "line": i,
-            'text': current_bbox.text,
-            'bounding_box': (current_bbox.left, current_bbox.top,
-                             current_bbox.left + current_bbox.width,
-                             current_bbox.top + current_bbox.height),
-            'words': [{'text': word.text,
-                       'bounding_box': (word.left, word.top,
-                                        word.left + word.width,
-                                        word.top + word.height)}
-                      for word in current_line]
-        }
-        return combined_results["text_line_" + str(i)]
-
-    for result in sorted_results:
-        if not current_line:
-            # Start a new line
-            current_line.append(result)
-            current_bbox = result
-        else:
-            # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
-            last_result = current_line[-1]
-
-            if abs(result.top - last_result.top) <= y_threshold and \
-               (result.left - (last_result.left + last_result.width)) <= x_threshold:
-                # Update the bounding box to include the new word
-                new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
-                current_bbox = OCRResult(
-                    text=f"{current_bbox.text} {result.text}",
-                    left=current_bbox.left,
-                    top=current_bbox.top,
-                    width=new_right - current_bbox.left,
-                    height=max(current_bbox.height, result.height)
-                )
-                current_line.append(result)
-            else:
-
-
-                # Commit the current line and start a new one
-                combined_results.append(current_bbox)
-                # new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-                #     'bounding_box': (current_bbox.left, current_bbox.top,
-                #                      current_bbox.left + current_bbox.width,
-                #                      current_bbox.top + current_bbox.height),
-                #     'words': [{'text': word.text,
-                #                'bounding_box': (word.left, word.top,
-                #                                 word.left + word.width,
-                #                                 word.top + word.height)}
-                #               for word in current_line]
-                # }
-                new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
-
-                line_counter += 1
-                current_line = [result]
-                current_bbox = result
-
-    # Append the last line
-    if current_bbox:
-        combined_results.append(current_bbox)
-        # new_format_results[current_bbox.text] = { # f"combined_text_{line_counter}"
-        #     'bounding_box': (current_bbox.left, current_bbox.top,
-        #                      current_bbox.left + current_bbox.width,
-        #                      current_bbox.top + current_bbox.height),
-        #     'words': [{'text': word.text,
-        #                'bounding_box': (word.left, word.top,
-        #                                 word.left + word.width,
-        #                                 word.top + word.height)}
-        #               for word in current_line]
-        # }
-
-        new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
-
-
-    return combined_results, new_format_results
-
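Note: the rewrite replaces the old per-line, " | "-separated batching (removed above) with a word-budgeted batcher: words accumulate until roughly 50 words or 200 characters, each batch goes to AWS Comprehend once, and do_aws_comprehend_call retries a failed call up to three times with a three-second pause. A standalone sketch of that policy; detect_batch is a stand-in for comprehend_client.detect_pii_entities, not the AWS API:

import time
from typing import Callable, List

MAX_WORDS, MAX_CHARS, MAX_RETRIES, RETRY_DELAY = 50, 200, 3, 3

def call_with_retry(fn: Callable[[str], dict], text: str) -> dict:
    # Retry a flaky call up to MAX_RETRIES times, as do_aws_comprehend_call does
    for attempt in range(MAX_RETRIES):
        try:
            return fn(text)
        except Exception:
            if attempt == MAX_RETRIES - 1:
                raise
            time.sleep(RETRY_DELAY)

def batch_words(words: List[str]) -> List[str]:
    # Accumulate words until the word or character budget is reached
    batches: List[str] = []
    current = ""
    word_count = 0
    for word in words:
        if word_count >= MAX_WORDS or len(current) + len(word) + 1 >= MAX_CHARS:
            batches.append(current)
            current, word_count = word, 1
        else:
            current = f"{current} {word}" if current else word
            word_count += 1
    if current:
        batches.append(current)
    return batches

def detect_batch(text: str) -> dict:
    # Stand-in for comprehend_client.detect_pii_entities(Text=..., LanguageCode=...)
    return {"Entities": []}

for batch in batch_words(("some long ocr page text " * 40).split()):
    response = call_with_retry(detect_batch, batch)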
 
1
  import pytesseract
2
  import numpy as np
3
  from presidio_analyzer import AnalyzerEngine, RecognizerResult
 
4
  from typing import List, Dict, Optional, Union, Tuple
5
  from dataclasses import dataclass
6
  import time
7
  import cv2
8
+ import copy
9
+ from copy import deepcopy
10
+ from pdfminer.layout import LTChar
11
  import PIL
12
+ from PIL import Image
13
  from typing import Optional, Tuple, Union
 
14
  from tools.helper_functions import clean_unicode_text
15
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
16
  from tools.load_spacy_model_custom_recognisers import custom_entities
 
 
17
 
18
  @dataclass
19
  class OCRResult:
 
173
 
174
  return Image.fromarray(filtered_image), metadata
175
 
 
176
  class SegmentedAdaptiveThreshold(ImagePreprocessor):
177
  """SegmentedAdaptiveThreshold class.
178
 
 
250
  metadata = {"C": c, "background_color": background_color, "contrast": contrast}
251
  return Image.fromarray(adaptive_threshold_image), metadata
252
 
 
 
 
253
  class ImageRescaling(ImagePreprocessor):
254
  """ImageRescaling class. Rescales images based on their size."""
255
 
 
297
  metadata = {"scale_factor": scale_factor}
298
  return Image.fromarray(rescaled_image), metadata
299
 
 
300
  class ContrastSegmentedImageEnhancer(ImagePreprocessor):
301
  """Class containing all logic to perform contrastive segmentation.
302
 
 
403
  """Check if two bounding boxes overlap."""
404
  return (box1[0] < box2[2] and box2[0] < box1[2] and
405
  box1[1] < box2[3] and box2[1] < box1[3])
406
+
407
+ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
408
+ for entity in page_analyser_result:
409
+ entity_start = entity.start
410
+ entity_end = entity.end
411
+
412
+ # Track if the entity has been added to any line
413
+ added_to_line = False
414
+
415
+ for batch_start, line_idx, original_line, chars in page_text_mapping:
416
+ batch_end = batch_start + len(original_line.text)
417
+
418
+ # Check if the entity overlaps with the current line
419
+ if batch_start < entity_end and batch_end > entity_start: # Overlap condition
420
+ relative_start = max(0, entity_start - batch_start) # Adjust start relative to the line
421
+ relative_end = min(entity_end - batch_start, len(original_line.text)) # Adjust end relative to the line
422
+
423
+ # Create a new adjusted entity
424
+ adjusted_entity = copy.deepcopy(entity)
425
+ adjusted_entity.start = relative_start
426
+ adjusted_entity.end = relative_end
427
+
428
+ # Check if this line already has an entry
429
+ existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
430
+
431
+ if existing_entry is None:
432
+ all_text_line_results.append((line_idx, [adjusted_entity]))
433
+ else:
434
+ existing_entry.append(adjusted_entity) # Append to the existing list of entities
435
+
436
+ added_to_line = True
437
+
438
+ # If the entity spans multiple lines, you may want to handle that here
439
+ if not added_to_line:
440
+ # Handle cases where the entity does not fit in any line (optional)
441
+ print(f"Entity '{entity}' does not fit in any line.")
442
+
443
+ return all_text_line_results
444
+
445
+ def map_back_comprehend_entity_results(response, current_batch_mapping, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
446
+ if not response or "Entities" not in response:
447
+ return all_text_line_results
448
+
449
+ for entity in response["Entities"]:
450
+ if entity.get("Type") not in chosen_redact_comprehend_entities:
451
+ continue
452
+
453
+ entity_start = entity["BeginOffset"]
454
+ entity_end = entity["EndOffset"]
455
+
456
+ # Track if the entity has been added to any line
457
+ added_to_line = False
458
+
459
+ # Find the correct line and offset within that line
460
+ for batch_start, line_idx, original_line, chars, line_offset in current_batch_mapping:
461
+ batch_end = batch_start + len(original_line.text[line_offset:])
462
+
463
+ # Check if the entity overlaps with the current line
464
+ if batch_start < entity_end and batch_end > entity_start: # Overlap condition
465
+ # Calculate the absolute position within the line
466
+ relative_start = max(0, entity_start - batch_start + line_offset)
467
+ relative_end = min(entity_end - batch_start + line_offset, len(original_line.text))
468
+
469
+ result_text = original_line.text[relative_start:relative_end]
470
+
471
+ if result_text not in allow_list:
472
+ adjusted_entity = entity.copy()
473
+ adjusted_entity["BeginOffset"] = relative_start # Now relative to the full line
474
+ adjusted_entity["EndOffset"] = relative_end
475
+
476
+ recogniser_entity = recognizer_result_from_dict(adjusted_entity)
477
+
478
+ existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
479
+ if existing_entry is None:
480
+ all_text_line_results.append((line_idx, [recogniser_entity]))
481
+ else:
482
+ existing_entry.append(recogniser_entity) # Append to the existing list of entities
483
+
484
+ added_to_line = True
485
+
486
+ # Optional: Handle cases where the entity does not fit in any line
487
+ if not added_to_line:
488
+ print(f"Entity '{entity}' does not fit in any line.")
489
+
490
+ return all_text_line_results
491
+
492
+ def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_client, language, allow_list, chosen_redact_comprehend_entities, all_text_line_results):
493
+ if not current_batch:
494
+ return all_text_line_results
495
+
496
+ max_retries = 3
497
+ retry_delay = 3
498
+
499
+ for attempt in range(max_retries):
500
+ try:
501
+ response = comprehend_client.detect_pii_entities(
502
+ Text=current_batch.strip(),
503
+ LanguageCode=language
504
+ )
505
+
506
+ all_text_line_results = map_back_comprehend_entity_results(
507
+ response,
508
+ current_batch_mapping,
509
+ allow_list,
510
+ chosen_redact_comprehend_entities,
511
+ all_text_line_results
512
+ )
513
+
514
+ return all_text_line_results
515
+
516
+ except Exception as e:
517
+ if attempt == max_retries - 1:
518
+ raise
519
+ time.sleep(retry_delay)
520
+
521
+ def run_page_text_redaction(
522
+ language: str,
523
+ chosen_redact_entities: List[str],
524
+ chosen_redact_comprehend_entities: List[str],
525
+ line_level_text_results_list: List[str],
526
+ line_characters: List,
527
+ page_analyser_results: List = [],
528
+ page_analysed_bounding_boxes: List = [],
529
+ comprehend_client = None,
530
+ allow_list: List[str] = None,
531
+ pii_identification_method: str = "Local",
532
+ nlp_analyser = None,
533
+ score_threshold: float = 0.0,
534
+ custom_entities: List[str] = None,
535
+ comprehend_query_number:int = 0#,
536
+ #merge_text_bounding_boxes_fn = merge_text_bounding_boxes
537
+ ):
538
+ #if not merge_text_bounding_boxes_fn:
539
+ # raise ValueError("merge_text_bounding_boxes_fn is required")
540
+
541
+ page_text = ""
542
+ page_text_mapping = []
543
+ all_text_line_results = []
544
+ comprehend_query_number = 0
545
+
546
+ # Collect all text from the page
547
+ for i, text_line in enumerate(line_level_text_results_list):
548
+ #print("line_level_text_results_list:", line_level_text_results_list)
549
+ if chosen_redact_entities:
550
+ if page_text:
551
+ #page_text += " | "
552
+ page_text += " "
553
+
554
+ start_pos = len(page_text)
555
+ page_text += text_line.text
556
+ page_text_mapping.append((start_pos, i, text_line, line_characters[i]))
557
+
558
+ # Process based on identification method
559
+ if pii_identification_method == "Local":
560
+ if not nlp_analyser:
561
+ raise ValueError("nlp_analyser is required for Local identification method")
562
+
563
+ print("page text:", page_text)
564
+
565
+ page_analyser_result = nlp_analyser.analyze(
566
+ text=page_text,
567
+ language=language,
568
+ entities=chosen_redact_entities,
569
+ score_threshold=score_threshold,
570
+ return_decision_process=True,
571
+ allow_list=allow_list
572
+ )
573
+
574
+ #print("page_analyser_result:", page_analyser_result)
575
+
576
+ all_text_line_results = map_back_entity_results(
577
+ page_analyser_result,
578
+ page_text_mapping,
579
+ all_text_line_results
580
+ )
581
+
582
+ #print("all_text_line_results:", all_text_line_results)
583
+
584
+ elif pii_identification_method == "AWS Comprehend":
585
+ #print("page text:", page_text)
586
+
587
+ # Process custom entities if any
588
+ if custom_entities:
589
+ custom_redact_entities = [
590
+ entity for entity in chosen_redact_comprehend_entities
591
+ if entity in custom_entities
592
+ ]
593
+ if custom_redact_entities:
594
+ page_analyser_result = nlp_analyser.analyze(
595
+ text=page_text,
596
+ language=language,
597
+ entities=custom_redact_entities,
598
+ score_threshold=score_threshold,
599
+ return_decision_process=True,
600
+ allow_list=allow_list
601
+ )
602
+
603
+ print("page_analyser_result:", page_analyser_result)
604
+
605
+ all_text_line_results = map_back_entity_results(
606
+ page_analyser_result,
607
+ page_text_mapping,
608
+ all_text_line_results
609
+ )
610
+
611
+ current_batch = ""
612
+ current_batch_mapping = []
613
+ batch_char_count = 0
614
+ batch_word_count = 0
615
+
616
+ for i, text_line in enumerate(line_level_text_results_list):
617
+ words = text_line.text.split()
618
+ word_start_positions = []
619
+
620
+ # Calculate word start positions within the line
621
+ current_pos = 0
622
+ for word in words:
623
+ word_start_positions.append(current_pos)
624
+ current_pos += len(word) + 1 # +1 for space
625
+
626
+ for word_idx, word in enumerate(words):
627
+ new_batch_char_count = len(current_batch) + len(word) + 1
628
+
629
+ if batch_word_count >= 50 or new_batch_char_count >= 200:
630
+ # Process current batch
631
+ all_text_line_results = do_aws_comprehend_call(
632
+ current_batch,
633
+ current_batch_mapping,
634
+ comprehend_client,
635
+ language,
636
+ allow_list,
637
+ chosen_redact_comprehend_entities,
638
+ all_text_line_results
639
+ )
640
+ comprehend_query_number += 1
641
+
642
+ # Start new batch
643
+ current_batch = word
644
+ batch_word_count = 1
645
+ batch_char_count = len(word)
646
+ current_batch_mapping = [(0, i, text_line, line_characters[i], word_start_positions[word_idx])]
647
+ else:
648
+ if current_batch:
649
+ current_batch += " "
650
+ batch_char_count += 1
651
+ current_batch += word
652
+ batch_char_count += len(word)
653
+ batch_word_count += 1
654
+
655
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
656
+ current_batch_mapping.append((
657
+ batch_char_count - len(word),
658
+ i,
659
+ text_line,
660
+ line_characters[i],
661
+ word_start_positions[word_idx] # Add the word's start position within its line
662
+ ))
663
+
664
+ # Process final batch
665
+ if current_batch:
666
+ all_text_line_results = do_aws_comprehend_call(
667
+ current_batch,
668
+ current_batch_mapping,
669
+ comprehend_client,
670
+ language,
671
+ allow_list,
672
+ chosen_redact_comprehend_entities,
673
+ all_text_line_results
674
+ )
675
+ comprehend_query_number += 1
676
+
677
+ # Process results for each line
678
+ for i, text_line in enumerate(line_level_text_results_list):
679
+ line_results = next((results for idx, results in all_text_line_results if idx == i), [])
680
+
681
+ if line_results:
682
+ text_line_bounding_boxes = merge_text_bounding_boxes(
683
+ line_results,
684
+ line_characters[i]
685
+ )
686
+
687
+ page_analyser_results.extend(line_results)
688
+ page_analysed_bounding_boxes.extend(text_line_bounding_boxes)
689
+
690
+ return page_analysed_bounding_boxes
691
+
692
+ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
693
+ '''
694
+ Merge identified bounding boxes containing PII that are very close to one another
695
+ '''
696
+ analysed_bounding_boxes = []
697
+ original_bounding_boxes = [] # List to hold original bounding boxes
698
+
699
+ if len(analyser_results) > 0 and len(characters) > 0:
700
+ # Extract bounding box coordinates for sorting
701
+ bounding_boxes = []
702
+ for result in analyser_results:
703
+ #print("Result:", result)
704
+ char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
705
+ char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
706
+ if char_boxes:
707
+ # Calculate the bounding box that encompasses all characters
708
+ left = min(box[0] for box in char_boxes)
709
+ bottom = min(box[1] for box in char_boxes)
710
+ right = max(box[2] for box in char_boxes)
711
+ top = max(box[3] for box in char_boxes) + vertical_padding
712
+ bbox = [left, bottom, right, top]
713
+ bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
714
+
715
+ # Store original bounding boxes
716
+ original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
717
+ #print("Original bounding boxes:", original_bounding_boxes)
718
+
719
+ # Sort the results by y-coordinate and then by x-coordinate
720
+ bounding_boxes.sort()
721
+
722
+ merged_bounding_boxes = []
723
+ current_box = None
724
+ current_y = None
725
+ current_result = None
726
+ current_text = []
727
+
728
+ for y, x, result, next_box, text in bounding_boxes:
729
+ if current_y is None or current_box is None:
730
+ # Initialize the first bounding box
731
+ current_box = next_box
732
+ current_y = next_box[1]
733
+ current_result = result
734
+ current_text = list(text)
735
+ else:
736
+ vertical_diff_bboxes = abs(next_box[1] - current_y)
737
+ horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
738
+
739
+ if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
740
+ # Merge bounding boxes
741
+ #print("Merging boxes")
742
+ merged_box = current_box.copy()
743
+ merged_result = current_result
744
+ merged_text = current_text.copy()
745
+
746
+ merged_box[2] = next_box[2] # Extend horizontally
747
+ merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
748
+ merged_result.end = max(current_result.end, result.end) # Extend text range
749
+ try:
750
+ if current_result.entity_type != result.entity_type:
751
+ merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
752
+ else:
753
+ merged_result.entity_type = current_result.entity_type
754
+ except Exception as e:
755
+ print("Unable to combine result entity types:", e)
756
+ if current_text:
757
+ merged_text.append(" ") # Add space between texts
758
+ merged_text.extend(text)
759
+
760
+ merged_bounding_boxes.append({
761
+ "text": "".join(merged_text),
762
+ "boundingBox": merged_box,
763
+ "result": merged_result
764
+ })
765
+
766
+ else:
767
+ # Start a new bounding box
768
+ current_box = next_box
769
+ current_y = next_box[1]
770
+ current_result = result
771
+ current_text = list(text)
772
+
773
+ # Combine original and merged bounding boxes
774
+ analysed_bounding_boxes.extend(original_bounding_boxes)
775
+ analysed_bounding_boxes.extend(merged_bounding_boxes)
776
+
777
+ #print("Analysed bounding boxes:", analysed_bounding_boxes)
778
+
779
+ return analysed_bounding_boxes
780
+
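To make the merge rule above concrete: two boxes are combined when their bottom edges sit within 5 units of each other and the horizontal gap is at most combine_pixel_dist. A minimal standalone sketch of that criterion, using made-up [left, bottom, right, top] lists rather than the pdfminer LTChar objects the function itself consumes:

# Sketch of the merge criterion only; plain lists stand in for character boxes.
def boxes_should_merge(current_box, next_box, combine_pixel_dist=20):
    vertical_diff = abs(next_box[1] - current_box[1])    # compare bottom edges
    horizontal_gap = abs(next_box[0] - current_box[2])   # next box's left vs current right
    return vertical_diff <= 5 and horizontal_gap <= combine_pixel_dist

print(boxes_should_merge([0, 100, 50, 112], [60, 101, 90, 112]))  # True: same line, 10pt gap
print(boxes_should_merge([0, 100, 50, 112], [0, 80, 50, 92]))     # False: next line down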
781
+ # Function to combine OCR results into line-level results
782
+ def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
783
+ # Group OCR results into lines based on y_threshold
784
+ lines = []
785
+ current_line = []
786
+ for result in sorted(ocr_results, key=lambda x: x.top):
787
+ if not current_line or abs(result.top - current_line[0].top) <= y_threshold:
788
+ current_line.append(result)
789
+ else:
790
+ lines.append(current_line)
791
+ current_line = [result]
792
+ if current_line:
793
+ lines.append(current_line)
794
+
795
+ # Sort each line by left position
796
+ for line in lines:
797
+ line.sort(key=lambda x: x.left)
798
+
799
+ # Flatten the sorted lines back into a single list
800
+ sorted_results = [result for line in lines for result in line]
801
+
802
+ combined_results = []
803
+ new_format_results = {}
804
+ current_line = []
805
+ current_bbox = None
806
+ line_counter = 1
807
+
808
+ def create_ocr_result_with_children(combined_results, i, current_bbox, current_line):
809
+ combined_results["text_line_" + str(i)] = {
810
+ "line": i,
811
+ 'text': current_bbox.text,
812
+ 'bounding_box': (current_bbox.left, current_bbox.top,
813
+ current_bbox.left + current_bbox.width,
814
+ current_bbox.top + current_bbox.height),
815
+ 'words': [{'text': word.text,
816
+ 'bounding_box': (word.left, word.top,
817
+ word.left + word.width,
818
+ word.top + word.height)}
819
+ for word in current_line]
820
+ }
821
+ return combined_results["text_line_" + str(i)]
822
+
823
+ for result in sorted_results:
824
+ if not current_line:
825
+ # Start a new line
826
+ current_line.append(result)
827
+ current_bbox = result
828
+ else:
829
+ # Check if the result is on the same line (y-axis) and close horizontally (x-axis)
830
+ last_result = current_line[-1]
831
+
832
+ if abs(result.top - last_result.top) <= y_threshold and \
833
+ (result.left - (last_result.left + last_result.width)) <= x_threshold:
834
+ # Update the bounding box to include the new word
835
+ new_right = max(current_bbox.left + current_bbox.width, result.left + result.width)
836
+ current_bbox = OCRResult(
837
+ text=f"{current_bbox.text} {result.text}",
838
+ left=current_bbox.left,
839
+ top=current_bbox.top,
840
+ width=new_right - current_bbox.left,
841
+ height=max(current_bbox.height, result.height)
842
+ )
843
+ current_line.append(result)
844
+ else:
845
+
846
+
847
+ # Commit the current line and start a new one
848
+ combined_results.append(current_bbox)
849
+
850
+ new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
851
+
852
+ line_counter += 1
853
+ current_line = [result]
854
+ current_bbox = result
855
+
856
+ # Append the last line
857
+ if current_bbox:
858
+ combined_results.append(current_bbox)
859
+
860
+ new_format_results["text_line_" + str(line_counter)] = create_ocr_result_with_children(new_format_results, line_counter, current_bbox, current_line)
861
+
862
+
863
+ return combined_results, new_format_results
864
 
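As a rough illustration of the y-threshold grouping above, with a hypothetical stand-in for OCRResult (the real class also carries width and height):

from dataclasses import dataclass

@dataclass
class Word:  # stand-in for OCRResult; fields assumed from the usage above
    text: str
    left: int
    top: int

def group_into_lines(words, y_threshold=12):
    lines, current = [], []
    for w in sorted(words, key=lambda w: w.top):
        if not current or abs(w.top - current[0].top) <= y_threshold:
            current.append(w)
        else:
            lines.append(current)
            current = [w]
    if current:
        lines.append(current)
    for line in lines:
        line.sort(key=lambda w: w.left)  # left-to-right within each line
    return lines

words = [Word("world", 60, 11), Word("Hello", 0, 10), Word("Next", 0, 40)]
print([[w.text for w in line] for line in group_into_lines(words)])  # [['Hello', 'world'], ['Next']]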
865
  class CustomImageAnalyzerEngine:
866
  def __init__(
 
915
  self,
916
  line_level_ocr_results: List[OCRResult],
917
  ocr_results_with_children: Dict[str, Dict],
918
+ chosen_redact_comprehend_entities: List[str],
919
+ pii_identification_method: str = "Local",
920
+ comprehend_client = "",
921
  **text_analyzer_kwargs
922
  ) -> List[CustomImageRecognizerResult]:
 
923
 
924
+ page_text = ""
925
+ page_text_mapping = []
926
+ all_text_line_results = []
927
  comprehend_query_number = 0
928
 
929
+ # Collect all text and create mapping
930
  for i, line_level_ocr_result in enumerate(line_level_ocr_results):
931
+ if page_text:
932
+ page_text += " "
933
+ start_pos = len(page_text)
934
+ page_text += line_level_ocr_result.text
935
+ # Note: We're not passing line_characters here since it's not needed for this use case
936
+ page_text_mapping.append((start_pos, i, line_level_ocr_result, None))
937
+
938
+ # Process using either Local or AWS Comprehend
939
+ if pii_identification_method == "Local":
940
+ analyzer_result = self.analyzer_engine.analyze(
941
+ text=page_text,
942
+ **text_analyzer_kwargs
943
+ )
944
+ all_text_line_results = map_back_entity_results(
945
+ analyzer_result,
946
+ page_text_mapping,
947
+ all_text_line_results
948
+ )
949
 
950
+ elif pii_identification_method == "AWS Comprehend":
951
+ # Handle custom entities first
952
+ if custom_entities:
953
+ custom_redact_entities = [
954
+ entity for entity in chosen_redact_comprehend_entities
955
+ if entity in custom_entities
956
+ ]
957
+ if custom_redact_entities:
958
+ text_analyzer_kwargs["entities"] = custom_redact_entities
959
+ page_analyser_result = self.analyzer_engine.analyze(
960
+ text=page_text,
961
+ **text_analyzer_kwargs
962
+ )
963
+ all_text_line_results = map_back_entity_results(
964
+ page_analyser_result,
965
+ page_text_mapping,
966
+ all_text_line_results
967
+ )
968
 
969
+ # Process text in batches for AWS Comprehend
970
+ current_batch = ""
971
+ current_batch_mapping = []
972
+ batch_char_count = 0
973
+ batch_word_count = 0
974
 
975
+ for i, text_line in enumerate(line_level_ocr_results):
976
+ words = text_line.text.split()
977
+ word_start_positions = []
978
+ current_pos = 0
979
 
980
+ for word in words:
981
+ word_start_positions.append(current_pos)
982
+ current_pos += len(word) + 1
983
 
984
+ for word_idx, word in enumerate(words):
985
+ new_batch_char_count = len(current_batch) + len(word) + 1
 
 
986
 
987
+ if batch_word_count >= 50 or new_batch_char_count >= 200:
988
+ # Process current batch
989
+ all_text_line_results = do_aws_comprehend_call(
990
+ current_batch,
991
+ current_batch_mapping,
992
+ comprehend_client,
993
+ text_analyzer_kwargs["language"],
994
+ text_analyzer_kwargs.get('allow_list', []),
995
+ chosen_redact_comprehend_entities,
996
+ all_text_line_results
997
+ )
998
  comprehend_query_number += 1
999
 
1000
  # Reset batch
1001
+ current_batch = word
1002
+ batch_word_count = 1
1003
+ batch_char_count = len(word)
1004
+ current_batch_mapping = [(0, i, text_line, None, word_start_positions[word_idx])]
1005
+ else:
1006
+ if current_batch:
1007
+ current_batch += " "
1008
+ batch_char_count += 1
1009
+ current_batch += word
1010
+ batch_char_count += len(word)
1011
+ batch_word_count += 1
1012
+
1013
+ if not current_batch_mapping or current_batch_mapping[-1][1] != i:
1014
+ current_batch_mapping.append((
1015
+ batch_char_count - len(word),
1016
+ i,
1017
+ text_line,
1018
+ None,
1019
+ word_start_positions[word_idx]
1020
+ ))
1021
+
1022
+ # Process final batch if any
1023
+ if current_batch:
1024
+ all_text_line_results = do_aws_comprehend_call(
1025
+ current_batch,
1026
+ current_batch_mapping,
1027
+ comprehend_client,
1028
+ text_analyzer_kwargs["language"],
1029
+ text_analyzer_kwargs.get('allow_list', []),
1030
+ chosen_redact_comprehend_entities,
1031
+ all_text_line_results
1032
+ )
1033
+ comprehend_query_number += 1
1034
 
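The flush rule above (send a batch to Comprehend once it holds 50 words or would pass 200 characters) can be sketched in isolation; process_batch below is a hypothetical stand-in for do_aws_comprehend_call, and the offset bookkeeping is omitted:

def batch_words(lines, process_batch, max_words=50, max_chars=200):
    batch, word_count = "", 0
    for text in lines:
        for word in text.split():
            if word_count >= max_words or len(batch) + len(word) + 1 >= max_chars:
                process_batch(batch)                 # flush before adding this word
                batch, word_count = word, 1
            else:
                batch = word if not batch else batch + " " + word
                word_count += 1
    if batch:
        process_batch(batch)                         # final partial batch, as above

batch_words(["one two three"] * 40, lambda b: print(len(b.split()), "words /", len(b), "chars"))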
1035
+
1036
 
1037
+ # Process results and create bounding boxes
1038
+ combined_results = []
1039
+ for i, text_line in enumerate(line_level_ocr_results):
1040
+ line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1041
+ if line_results and i < len(ocr_results_with_children):
1042
  child_level_key = list(ocr_results_with_children.keys())[i]
1043
  ocr_results_with_children_line_level = ocr_results_with_children[child_level_key]
1044
+
1045
+ for result in line_results:
1046
+ bbox_results = self.map_analyzer_results_to_bounding_boxes(
1047
+ [result],
1048
+ [OCRResult(
1049
+ text=text_line.text[result.start:result.end],
1050
+ left=text_line.left,
1051
+ top=text_line.top,
1052
+ width=text_line.width,
1053
+ height=text_line.height
1054
+ )],
1055
+ text_line.text,
1056
+ text_analyzer_kwargs.get('allow_list', []),
1057
+ ocr_results_with_children_line_level
1058
  )
1059
+ combined_results.extend(bbox_results)
1060
 
1061
  return combined_results, comprehend_query_number
1062
 
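map_back_entity_results itself is not part of this diff; conceptually it walks the (start_pos, line_index, line, ...) tuples built above to translate entity spans found in the concatenated page_text back to per-line offsets. A hedged sketch of that idea:

from collections import namedtuple

Line = namedtuple("Line", "text")  # stand-in for the OCRResult stored in the mapping

def map_span_to_line(span_start, span_end, page_text_mapping):
    # Assumed behaviour: global (start, end) span -> (line index, local start, local end).
    for start_pos, line_idx, line, _ in page_text_mapping:
        line_end = start_pos + len(line.text)
        if start_pos <= span_start < line_end:
            return line_idx, span_start - start_pos, min(span_end, line_end) - start_pos
    return None

mapping = [(0, 0, Line("John Smith"), None), (11, 1, Line("lives here"), None)]
print(map_span_to_line(5, 10, mapping))  # (0, 5, 10): "Smith" inside line 0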
1063
  @staticmethod
1064
  def map_analyzer_results_to_bounding_boxes(
1065
+ text_analyzer_results: List[RecognizerResult],
1066
+ redaction_relevant_ocr_results: List[OCRResult],
1067
+ full_text: str,
1068
+ allow_list: List[str],
1069
+ ocr_results_with_children_child_info: Dict[str, Dict]
1070
+ ) -> List[CustomImageRecognizerResult]:
1071
  redaction_bboxes = []
 
1072
 
1073
  for redaction_relevant_ocr_result in redaction_relevant_ocr_results:
1074
+ #print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
1075
 
1076
+ line_text = ocr_results_with_children_child_info['text']
1077
+ line_length = len(line_text)
1078
+ redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
+ # print(f"Processing line: '{line_text}'")
1081
+
1082
  for redaction_result in text_analyzer_results:
1083
+ # print(f"Checking redaction result: {redaction_result}")
1084
+ # print("redaction_text:", redaction_text)
1085
+ # print("line_length:", line_length)
1086
+ # print("line_text:", line_text)
1087
+
1088
+ # Check if the redaction text is not in the allow list
1089
+
1090
+ if redaction_text not in allow_list:
1091
+
1092
+ # Adjust start and end to be within line bounds
1093
+ start_in_line = max(0, redaction_result.start)
1094
+ end_in_line = min(line_length, redaction_result.end)
1095
+
1096
+ # Get the matched text from this line
1097
+ matched_text = line_text[start_in_line:end_in_line]
1098
+ matched_words = matched_text.split()
1099
+
1100
+ # print(f"Found match: '{matched_text}' in line")
1101
+
1102
+ # Find the corresponding words in the OCR results
1103
+ matching_word_boxes = []
1104
+ for word_info in ocr_results_with_children_child_info.get('words', []):
1105
+ # Check if this word is part of our match
1106
+ if any(word.lower() in word_info['text'].lower() for word in matched_words):
1107
+ matching_word_boxes.append(word_info['bounding_box'])
1108
+ # print(f"Matched word: {word_info['text']}")
1109
+
1110
+ if matching_word_boxes:
1111
+ # Calculate the combined bounding box for all matching words
1112
+ left = min(box[0] for box in matching_word_boxes)
1113
+ top = min(box[1] for box in matching_word_boxes)
1114
+ right = max(box[2] for box in matching_word_boxes)
1115
+ bottom = max(box[3] for box in matching_word_boxes)
1116
+
1117
+ redaction_bboxes.append(
1118
+ CustomImageRecognizerResult(
1119
+ entity_type=redaction_result.entity_type,
1120
+ start=start_in_line,
1121
+ end=end_in_line,
1122
+ score=redaction_result.score,
1123
+ left=left,
1124
+ top=top,
1125
+ width=right - left,
1126
+ height=bottom - top,
1127
+ text=matched_text
1128
+ )
1129
  )
1130
+ # print(f"Added bounding box for: '{matched_text}'")
 
 
1131
 
1132
  return redaction_bboxes
1133
 
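The combined redaction box is simply the envelope of the matched word boxes; with made-up [left, top, right, bottom] values:

boxes = [[10, 20, 60, 35], [65, 21, 120, 36]]  # two matched word boxes
left = min(b[0] for b in boxes)
top = min(b[1] for b in boxes)
right = max(b[2] for b in boxes)
bottom = max(b[3] for b in boxes)
print(left, top, right - left, bottom - top)   # 10 20 110 16 -> left, top, width, height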
1134
  @staticmethod
1135
  def remove_space_boxes(ocr_result: dict) -> dict:
1136
  """Remove OCR bboxes that are for spaces.
 
1137
  :param ocr_result: OCR results (raw or thresholded).
1138
  :return: OCR results with empty words removed.
1139
  """
 
1156
  ocr_result: Dict[str, List[Union[int, str]]], scale_factor: float
1157
  ) -> Dict[str, float]:
1158
  """Scale down the bounding box results based on a scale percentage.
 
1159
  :param ocr_result: OCR results (raw).
1160
  :param scale_factor: Scale factor for resizing the bounding box.
 
1161
  :return: OCR results (scaled).
1162
  """
1163
  scaled_results = deepcopy(ocr_result)
 
1204
  estimated_width = int(proportion * ocr_result.width)
1205
 
1206
  return estimated_width
 
tools/file_conversion.py CHANGED
@@ -201,7 +201,7 @@ def process_file(file_path:str, prepare_for_review:bool=False):
201
  if file_extension in ['.jpg', '.jpeg', '.png']:
202
  print(f"{file_path} is an image file.")
203
  # Perform image processing here
204
- img_object = [Image.open(file_path)]
205
  # Load images from the file paths
206
 
207
  # Check if the file is a PDF
@@ -490,6 +490,7 @@ def prepare_image_or_pdf(
490
  else:
491
  file_path = file.name
492
  file_path_without_ext = get_file_path_end(file_path)
 
493
 
494
  if not file_path:
495
  out_message = "Please select a file."
@@ -532,8 +533,13 @@ def prepare_image_or_pdf(
532
 
533
  image_file_paths = process_file(file_path_str, prepare_for_review)
534
 
535
- print("Inserted image into PDF file")
 
 
536
537
 
538
  elif file_extension in ['.csv']:
539
  review_file_csv = read_file(file)
@@ -738,6 +744,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
738
  reported_number = int(number) + 1
739
  else:
740
  print("No number found before .png")
 
741
 
742
  # Check if 'boxes' is in the annotation, if not, add an empty list
743
  if 'boxes' not in annotation:
 
201
  if file_extension in ['.jpg', '.jpeg', '.png']:
202
  print(f"{file_path} is an image file.")
203
  # Perform image processing here
204
+ img_object = [file_path] #[Image.open(file_path)]
205
  # Load images from the file paths
206
 
207
  # Check if the file is a PDF
 
490
  else:
491
  file_path = file.name
492
  file_path_without_ext = get_file_path_end(file_path)
493
+ file_name_with_ext = os.path.basename(file_path)
494
 
495
  if not file_path:
496
  out_message = "Please select a file."
 
533
 
534
  image_file_paths = process_file(file_path_str, prepare_for_review)
535
 
536
+ #print("image_file_paths:", image_file_paths)
537
+
538
+ converted_file_path = output_folder + file_name_with_ext
539
 
540
+ pymupdf_doc.save(converted_file_path)
541
+
542
+ print("Inserted image into PDF file")
543
 
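For context, wrapping an image file in a single-page PyMuPDF document before saving can be done roughly as below; a sketch assuming PyMuPDF is installed and a page.png file exists, not the app's exact conversion code:

import fitz  # PyMuPDF

doc = fitz.open()                 # new, empty PDF
img = fitz.open("page.png")       # PyMuPDF opens common image formats as documents
rect = img[0].rect                # image dimensions
page = doc.new_page(width=rect.width, height=rect.height)
page.insert_image(rect, filename="page.png")
doc.save("page_as_pdf.pdf")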
544
  elif file_extension in ['.csv']:
545
  review_file_csv = read_file(file)
 
744
  reported_number = int(number) + 1
745
  else:
746
  print("No number found before .png")
747
+ reported_number = 1
748
 
749
  # Check if 'boxes' is in the annotation, if not, add an empty list
750
  if 'boxes' not in annotation:
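The reported page number is recovered from the image filename (e.g. "..._3.png" is zero-based page 3, reported as 4), with the new fallback of 1 when no number is found. A hedged sketch of that extraction; the exact pattern used in the source is not shown in this hunk:

import re

def reported_page_number(file_name, default=1):
    match = re.search(r"(\d+)\.png$", file_name)  # digits just before .png (assumed pattern)
    return int(match.group(1)) + 1 if match else default

print(reported_page_number("doc_page_0.png"))  # 2: zero-based index reported one-based
print(reported_page_number("doc.png"))         # 1: fallback added in this commit
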
tools/file_redaction.py CHANGED
@@ -25,13 +25,13 @@ from collections import defaultdict # For efficient grouping
25
 
26
  from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
- from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
- from tools.presidio_analyzer_custom import recognizer_result_from_dict
35
 
36
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
37
  page_break_value = get_or_create_env_var('page_break_value', '50000')
@@ -136,6 +136,9 @@ def choose_and_run_redactor(file_paths:List[str],
136
  tic = time.perf_counter()
137
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
138
 
139
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
140
  custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
141
 
@@ -159,7 +162,6 @@ def choose_and_run_redactor(file_paths:List[str],
159
  elif (first_loop_state == False) & (current_loop_page == 999):
160
  current_loop_page = 0
161
 
162
-
163
  if not out_file_paths:
164
  out_file_paths = []
165
 
@@ -184,21 +186,33 @@ def choose_and_run_redactor(file_paths:List[str],
184
  combined_out_message = '\n'.join(out_message)
185
  else:
186
  combined_out_message = out_message
187
 
188
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
189
  print("Estimated total processing time:", str(estimate_total_processing_time))
190
 
191
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
192
 
193
  # If we have reached the last page, return message
194
  if current_loop_page >= number_of_pages:
195
- print("current_loop_page:", current_loop_page, "is equal to or greater than number of pages in document:", number_of_pages)
196
 
197
  # Set to a very high number so as not to mix up with subsequent file processing by the user
198
  current_loop_page = 999
199
  combined_out_message = out_message
200
 
201
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
202
 
203
  # Create allow list
204
  # If string, assume file path
@@ -221,7 +235,7 @@ def choose_and_run_redactor(file_paths:List[str],
221
  comprehend_client = ""
222
  out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
223
  print(out_message)
224
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
225
  else:
226
  comprehend_client = ""
227
 
@@ -233,7 +247,7 @@ def choose_and_run_redactor(file_paths:List[str],
233
  textract_client = ""
234
  out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
235
  print(out_message)
236
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
237
  else:
238
  textract_client = ""
239
 
@@ -265,8 +279,9 @@ def choose_and_run_redactor(file_paths:List[str],
265
  file_path = file.name
266
 
267
  if file_path:
268
- file_path_without_ext = get_file_path_end(file_path)
269
- print("Redacting file:", file_path_without_ext)
 
270
 
271
  is_a_pdf = is_pdf(file_path) == True
272
  if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -277,16 +292,16 @@ def choose_and_run_redactor(file_paths:List[str],
277
  out_message = "No file selected"
278
  print(out_message)
279
 
280
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
281
 
282
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
283
 
284
  # Analyse and redact image-based PDF or image
285
  if is_pdf_or_image(file_path) == False:
286
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
287
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
288
 
289
- print("Redacting file " + file_path_without_ext + " as an image-based file")
290
 
291
  pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
292
  prepared_pdf_image_paths,
@@ -328,7 +343,7 @@ def choose_and_run_redactor(file_paths:List[str],
328
 
329
  if is_pdf(file_path) == False:
330
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
331
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
332
 
333
  # Analyse text-based pdf
334
  print('Redacting file as text-based PDF')
@@ -356,12 +371,12 @@ def choose_and_run_redactor(file_paths:List[str],
356
  else:
357
  out_message = "No redaction method selected"
358
  print(out_message)
359
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
360
 
361
  # If at last page, save to file
362
  if current_loop_page >= number_of_pages:
363
 
364
- print("Current page loop:", current_loop_page, "is greater or equal to number of pages:", number_of_pages)
365
  latest_file_completed += 1
366
  current_loop_page = 999
367
 
@@ -370,36 +385,43 @@ def choose_and_run_redactor(file_paths:List[str],
370
 
371
  # Save file
372
  if is_pdf(file_path) == False:
373
- out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_pdf.pdf"
374
- pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
375
 
376
  else:
377
- out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
378
- pymupdf_doc.save(out_image_file_path)
379
 
380
- out_file_paths.append(out_image_file_path)
381
 
382
  #if log_files_output_paths:
383
  # log_files_output_paths.extend(log_files_output_paths)
384
 
385
- logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
386
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
387
  log_files_output_paths.append(logs_output_file_name)
388
 
389
- all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
390
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
391
  out_file_paths.append(all_text_output_file_name)
392
 
393
  # Save the gradio_annotation_boxes to a JSON file
394
  try:
395
- print("Saving annotations to JSON")
396
 
397
- out_annotation_file_path = out_image_file_path + '_review_file.json'
398
  with open(out_annotation_file_path, 'w') as f:
399
  json.dump(annotations_all_pages, f)
400
  log_files_output_paths.append(out_annotation_file_path)
401
 
402
- print("Saving annotations to CSV")
403
 
404
  # Convert json to csv and also save this
405
  #print("annotations_all_pages:", annotations_all_pages)
@@ -407,14 +429,14 @@ def choose_and_run_redactor(file_paths:List[str],
407
 
408
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
409
 
410
- out_review_file_file_path = out_image_file_path + '_review_file.csv'
411
- review_df.to_csv(out_review_file_file_path, index=None)
412
- out_file_paths.append(out_review_file_file_path)
413
 
414
  print("Saved review file to csv")
415
 
416
  except Exception as e:
417
- print("Could not save annotations to json file:", e)
418
 
419
  # Make a combined message for the file
420
  if isinstance(out_message, list):
@@ -429,7 +451,7 @@ def choose_and_run_redactor(file_paths:List[str],
429
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
430
 
431
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
432
- print("Estimated total processing time:", str(estimate_total_processing_time))
433
 
434
  else:
435
  toc = time.perf_counter()
@@ -441,7 +463,7 @@ def choose_and_run_redactor(file_paths:List[str],
441
  if all_request_metadata:
442
  all_request_metadata_str = '\n'.join(all_request_metadata).strip()
443
 
444
- all_request_metadata_file_path = output_folder + file_path_without_ext + "_textract_request_metadata.txt"
445
 
446
  with open(all_request_metadata_file_path, "w") as f:
447
  f.write(all_request_metadata_str)
@@ -456,10 +478,15 @@ def choose_and_run_redactor(file_paths:List[str],
456
 
457
  # Ensure no duplicated output files
458
  log_files_output_paths = list(set(log_files_output_paths))
459
- out_file_paths = list(set(out_file_paths))
 
460
 
461
 
462
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
463
 
464
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
465
  '''
@@ -930,14 +957,7 @@ def redact_image_pdf(file_path:str,
930
  nlp_analyser.registry.remove_recognizer("CUSTOM")
931
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
932
  #print("new_custom_recogniser:", new_custom_recogniser)
933
- nlp_analyser.registry.add_recognizer(new_custom_recogniser)
934
-
935
- # List all elements currently in the nlp_analyser registry
936
- #print("Current recognizers in nlp_analyser registry:")
937
- for recognizer_name in nlp_analyser.registry.recognizers:
938
- print(recognizer_name)
939
-
940
-
941
 
942
 
943
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
@@ -1031,7 +1051,7 @@ def redact_image_pdf(file_path:str,
1031
 
1032
  #print("Image is in range of pages to redact")
1033
  if isinstance(image, str):
1034
- #print("image is a file path")
1035
  image = Image.open(image)
1036
 
1037
  # Need image size to convert textract OCR outputs to the correct sizes
@@ -1137,7 +1157,7 @@ def redact_image_pdf(file_path:str,
1137
  all_image_annotations_boxes = []
1138
 
1139
  for box in merged_redaction_bboxes:
1140
- print("box:", box)
1141
 
1142
  x0 = box.left
1143
  y0 = box.top
@@ -1299,6 +1319,8 @@ def get_text_container_characters(text_container:LTTextContainer):
1299
  for line in text_container
1300
  if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
1301
  for char in line]
 
 
1302
 
1303
  return characters
1304
  return []
@@ -1312,6 +1334,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1312
  line_level_characters_out = []
1313
  #all_line_level_characters_out = []
1314
  character_objects_out = [] # New list to store character objects
 
1315
 
1316
  # Initialize variables
1317
  full_text = ""
@@ -1326,12 +1349,19 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1326
  for char in char_objects:
1327
  character_objects_out.append(char) # Collect character objects
1328
1329
  if isinstance(char, LTAnno):
1330
1331
  added_text = char.get_text()
1332
 
1333
  # Handle double quotes
1334
- added_text = added_text.replace('"', '\\"') # Escape double quotes
1335
 
1336
  # Handle space separately by finalizing the word
1337
  full_text += added_text # Adds space or newline
@@ -1348,7 +1378,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1348
  if current_word:
1349
  word_bboxes.append((current_word, current_word_bbox))
1350
  # Create an OCRResult for the current line
1351
- line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
1352
  line_level_characters_out.append(character_objects_out)
1353
  # Reset for the next line
1354
  character_objects_out = []
@@ -1396,119 +1426,15 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1396
  # Convert special characters to a human-readable format
1397
  #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
1398
  full_text = clean_unicode_text(full_text)
 
1399
  #print("full_text:", full_text)
1400
 
1401
- line_level_results_out.append(OCRResult(full_text, round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1402
 
1403
  #line_level_characters_out = character_objects_out
1404
 
1405
  return line_level_results_out, line_level_characters_out # Return both results and character objects
1406
 
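The OCRResult rows built in this function derive left/top/width/height from the accumulated (x0, y0, x1, y1) line bbox with two-decimal rounding; numerically, with made-up values:

overall_bbox = (72.0, 701.25, 243.118, 713.4)  # made-up (x0, y0, x1, y1)
left, top = round(overall_bbox[0], 2), round(overall_bbox[1], 2)
width = round(overall_bbox[2] - overall_bbox[0], 2)
height = round(overall_bbox[3] - overall_bbox[1], 2)
print(left, top, width, height)  # 72.0 701.25 171.12 12.15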
1407
- def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
1408
- '''
1409
- Merge identified bounding boxes containing PII that are very close to one another
1410
- '''
1411
- analysed_bounding_boxes = []
1412
- original_bounding_boxes = [] # List to hold original bounding boxes
1413
-
1414
- if len(analyser_results) > 0 and len(characters) > 0:
1415
- # Extract bounding box coordinates for sorting
1416
- bounding_boxes = []
1417
- for result in analyser_results:
1418
- #print("Result:", result)
1419
- char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1420
- char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1421
- if char_boxes:
1422
- # Calculate the bounding box that encompasses all characters
1423
- left = min(box[0] for box in char_boxes)
1424
- bottom = min(box[1] for box in char_boxes)
1425
- right = max(box[2] for box in char_boxes)
1426
- top = max(box[3] for box in char_boxes) + vertical_padding
1427
- bbox = [left, bottom, right, top]
1428
- bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
1429
-
1430
- # Store original bounding boxes
1431
- original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
1432
- #print("Original bounding boxes:", original_bounding_boxes)
1433
-
1434
- # Sort the results by y-coordinate and then by x-coordinate
1435
- bounding_boxes.sort()
1436
-
1437
- merged_bounding_boxes = []
1438
- current_box = None
1439
- current_y = None
1440
- current_result = None
1441
- current_text = []
1442
-
1443
- for y, x, result, next_box, text in bounding_boxes:
1444
- if current_y is None or current_box is None:
1445
- # Initialize the first bounding box
1446
- current_box = next_box
1447
- current_y = next_box[1]
1448
- current_result = result
1449
- current_text = list(text)
1450
- else:
1451
- vertical_diff_bboxes = abs(next_box[1] - current_y)
1452
- horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
1453
-
1454
- if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
1455
- # Merge bounding boxes
1456
- #print("Merging boxes")
1457
- merged_box = current_box.copy()
1458
- merged_result = current_result
1459
- merged_text = current_text.copy()
1460
-
1461
- #print("current_box_max_x:", current_box[2])
1462
- #print("char_max_x:", next_box[2])
1463
-
1464
- merged_box[2] = next_box[2] # Extend horizontally
1465
- merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
1466
- merged_result.end = max(current_result.end, result.end) # Extend text range
1467
- try:
1468
- if current_result.entity_type != result.entity_type:
1469
- merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
1470
- else:
1471
- merged_result.entity_type = current_result.entity_type
1472
- except Exception as e:
1473
- print("Unable to combine result entity types:", e)
1474
- if current_text:
1475
- merged_text.append(" ") # Add space between texts
1476
- merged_text.extend(text)
1477
-
1478
- merged_bounding_boxes.append({
1479
- "text": "".join(merged_text),
1480
- "boundingBox": merged_box,
1481
- "result": merged_result
1482
- })
1483
-
1484
- else:
1485
- # Save the current merged box before starting a new one
1486
- # merged_bounding_boxes.append({
1487
- # "text": "".join(current_text),
1488
- # "boundingBox": current_box,
1489
- # "result": current_result
1490
- # })
1491
- # Start a new bounding box
1492
- current_box = next_box
1493
- current_y = next_box[1]
1494
- current_result = result
1495
- current_text = list(text)
1496
-
1497
- # Handle the last box
1498
- # if current_box is not None:
1499
- # merged_bounding_boxes.append({
1500
- # "text": "".join(current_text),
1501
- # "boundingBox": current_box,
1502
- # "result": current_result
1503
- # })
1504
-
1505
- # Combine original and merged bounding boxes
1506
- analysed_bounding_boxes.extend(original_bounding_boxes)
1507
- analysed_bounding_boxes.extend(merged_bounding_boxes)
1508
-
1509
- #print("Analysed bounding boxes:", analysed_bounding_boxes)
1510
-
1511
- return analysed_bounding_boxes
1512
 
1513
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1514
  decision_process_table = pd.DataFrame()
@@ -1559,6 +1485,182 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1559
  pikepdf_annotations_on_page.append(annotation)
1560
  return pikepdf_annotations_on_page
1561
1562
  def redact_text_pdf(
1563
  filename: str, # Path to the PDF file to be redacted
1564
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
@@ -1681,173 +1783,64 @@ def redact_text_pdf(
1681
 
1682
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1683
1684
  page_analyser_results = []
1685
  page_analysed_bounding_boxes = []
1686
 
1687
  characters = []
1688
  pikepdf_annotations_on_page = []
1689
  decision_process_table_on_page = pd.DataFrame()
1690
- page_text_outputs = pd.DataFrame()
1691
 
1692
  if analysis_type == text_ocr_option:
1693
  for n, text_container in enumerate(page_layout):
1694
-
1695
- text_container_analyser_results = []
1696
- text_container_analysed_bounding_boxes = []
1697
  characters = []
1698
 
 
 
1699
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
1700
  characters = get_text_container_characters(text_container)
1701
 
1702
  # Create dataframe for all the text on the page
1703
  line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1704
 
1705
- # Create page_text_outputs (OCR format outputs)
1706
  if line_level_text_results_list:
1707
  # Convert to DataFrame and add to ongoing logging table
1708
  line_level_text_results_df = pd.DataFrame([{
1709
  'page': page_no + 1,
1710
- 'text': result.text,
1711
  'left': result.left,
1712
  'top': result.top,
1713
  'width': result.width,
1714
  'height': result.height
1715
  } for result in line_level_text_results_list])
1716
 
1717
- page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
1718
-
1719
- # Initialize batching variables
1720
- current_batch = ""
1721
- current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1722
- all_text_line_results = [] # Store results for all lines
1723
-
1724
- # First pass: collect all lines into batches
1725
- for i, text_line in enumerate(line_level_text_results_list):
1726
- if chosen_redact_entities:
1727
- if pii_identification_method == "Local":
1728
-
1729
- #print("chosen_redact_entities:", chosen_redact_entities)
1730
-
1731
- # Process immediately for local analysis
1732
- text_line_analyser_result = nlp_analyser.analyze(
1733
- text=text_line.text,
1734
- language=language,
1735
- entities=chosen_redact_entities,
1736
- score_threshold=score_threshold,
1737
- return_decision_process=True,
1738
- allow_list=allow_list
1739
- )
1740
- all_text_line_results.append((i, text_line_analyser_result))
1741
-
1742
-
1743
- elif pii_identification_method == "AWS Comprehend":
1744
-
1745
- # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1746
- custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1747
-
1748
-
1749
- text_line_analyser_result = nlp_analyser.analyze(
1750
- text=text_line.text,
1751
- language=language,
1752
- entities=custom_redact_entities,
1753
- score_threshold=score_threshold,
1754
- return_decision_process=True,
1755
- allow_list=allow_list
1756
- )
1757
- all_text_line_results.append((i, text_line_analyser_result))
1758
-
1759
-
1760
- if len(text_line.text) >= 3:
1761
- # Add separator between lines
1762
- if current_batch:
1763
- current_batch += " | "
1764
-
1765
- start_pos = len(current_batch)
1766
- current_batch += text_line.text
1767
- current_batch_mapping.append((start_pos, i, text_line))
1768
-
1769
- # Process batch if approaching 300 characters or last line
1770
- if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1771
- print("length of text for Comprehend:", len(current_batch))
1772
-
1773
- try:
1774
- response = comprehend_client.detect_pii_entities(
1775
- Text=current_batch,
1776
- LanguageCode=language
1777
- )
1778
- except Exception as e:
1779
- print(e)
1780
- time.sleep(3)
1781
- response = comprehend_client.detect_pii_entities(
1782
- Text=current_batch,
1783
- LanguageCode=language
1784
- )
1785
-
1786
- comprehend_query_number += 1
1787
-
1788
- # Process response and map back to original lines
1789
- if response and "Entities" in response:
1790
- for entity in response["Entities"]:
1791
- entity_start = entity["BeginOffset"]
1792
- entity_end = entity["EndOffset"]
1793
-
1794
- # Find which line this entity belongs to
1795
- for batch_start, line_idx, original_line in current_batch_mapping:
1796
- batch_end = batch_start + len(original_line.text)
1797
-
1798
- # Check if entity belongs to this line
1799
- if batch_start <= entity_start < batch_end:
1800
- # Adjust offsets relative to original line
1801
- relative_start = entity_start - batch_start
1802
- relative_end = min(entity_end - batch_start, len(original_line.text))
1803
-
1804
- result_text = original_line.text[relative_start:relative_end]
1805
-
1806
- if result_text not in allow_list:
1807
- if entity.get("Type") in chosen_redact_comprehend_entities:
1808
- # Create adjusted entity
1809
- adjusted_entity = entity.copy()
1810
- adjusted_entity["BeginOffset"] = relative_start
1811
- adjusted_entity["EndOffset"] = relative_end
1812
-
1813
- recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1814
-
1815
- # Add to results for this line
1816
- existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1817
- if not existing_results:
1818
- all_text_line_results.append((line_idx, [recogniser_entity]))
1819
- else:
1820
- existing_results.append(recogniser_entity)
1821
-
1822
- # Reset batch
1823
- current_batch = ""
1824
- current_batch_mapping = []
1825
-
1826
- # Second pass: process results for each line
1827
- for i, text_line in enumerate(line_level_text_results_list):
1828
- text_line_analyser_result = []
1829
- text_line_bounding_boxes = []
1830
-
1831
- # Get results for this line
1832
- line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1833
-
1834
- if line_results:
1835
- text_line_analyser_result = line_results
1836
-
1837
- #print("Analysed text container, now merging bounding boxes")
1838
-
1839
- # Merge bounding boxes if very close together
1840
- text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1841
-
1842
- #print("merged bounding boxes")
1843
-
1844
- text_container_analyser_results.extend(text_line_analyser_result)
1845
- text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1846
-
1847
- #print("text_container_analyser_results:", text_container_analyser_results)
1848
-
1849
- page_analyser_results.extend(text_container_analyser_results) # Add this line
1850
- page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
1851
 
1852
 
1853
  #print("page_analyser_results:", page_analyser_results)
@@ -1879,17 +1872,18 @@ def redact_text_pdf(
1879
  reported_page_no = page_no + 1
1880
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1881
 
1882
  # Write logs
1883
  # Create decision process table
1884
  decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
1885
 
1886
  if not decision_process_table_on_page.empty:
1887
  all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1888
- #print("all_decision_process_table:", all_decision_process_table)
1889
-
1890
- if not page_text_outputs.empty:
1891
- page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1892
- all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_outputs])
1893
 
1894
  toc = time.perf_counter()
1895
 
 
25
 
26
  from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
+ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
+ from tools.presidio_analyzer_custom import recognizer_result_from_dict
35
 
36
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
37
  page_break_value = get_or_create_env_var('page_break_value', '50000')
 
136
  tic = time.perf_counter()
137
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
138
 
139
+ print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
140
+ review_out_file_paths = [prepared_pdf_file_paths[0]]
141
+
142
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
143
  custom_recogniser_word_list = custom_recogniser_word_list.iloc[:,0].tolist()
144
 
 
162
  elif (first_loop_state == False) & (current_loop_page == 999):
163
  current_loop_page = 0
164
 
 
165
  if not out_file_paths:
166
  out_file_paths = []
167
 
 
186
  combined_out_message = '\n'.join(out_message)
187
  else:
188
  combined_out_message = out_message
189
+
190
+ if len(review_out_file_paths) == 1:
191
+
192
+ out_review_file_path = [x for x in out_file_paths if "review_file" in x]
193
+
194
+ review_out_file_paths.extend(out_review_file_path)
195
 
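The review bundle sent back to the UI from the redact button is the prepared source PDF plus any output whose name contains "review_file"; in miniature, with made-up paths:

out_file_paths = ["out/doc.pdf_ocr_output.csv", "out/doc.pdf_review_file.csv"]
review_out_file_paths = ["out/doc_prepared.pdf"]  # prepared PDF always comes first
review_out_file_paths.extend(x for x in out_file_paths if "review_file" in x)
print(review_out_file_paths)  # ['out/doc_prepared.pdf', 'out/doc.pdf_review_file.csv']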
196
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
197
  print("Estimated total processing time:", str(estimate_total_processing_time))
198
 
199
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
200
 
201
  # If we have reached the last page, return message
202
  if current_loop_page >= number_of_pages:
203
+ print("Reached last page of document:", current_loop_page)
204
 
205
  # Set to a very high number so as not to mix up with subsequent file processing by the user
206
  current_loop_page = 999
207
  combined_out_message = out_message
208
 
209
+ if len(review_out_file_paths) == 1:
210
+
211
+ out_review_file_path = [x for x in out_file_paths if "review_file" in x]
212
+
213
+ review_out_file_paths.extend(out_review_file_path)
214
+
215
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
216
 
217
  # Create allow list
218
  # If string, assume file path
 
235
  comprehend_client = ""
236
  out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
237
  print(out_message)
238
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
239
  else:
240
  comprehend_client = ""
241
 
 
247
  textract_client = ""
248
  out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
249
  print(out_message)
250
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
251
  else:
252
  textract_client = ""
253
 
 
279
  file_path = file.name
280
 
281
  if file_path:
282
+ pdf_file_name_without_ext = get_file_path_end(file_path)
283
+ pdf_file_name_with_ext = os.path.basename(file_path)
284
+ print("Redacting file:", pdf_file_name_with_ext)
285
 
286
  is_a_pdf = is_pdf(file_path) == True
287
  if is_a_pdf == False and in_redact_method == text_ocr_option:
 
292
  out_message = "No file selected"
293
  print(out_message)
294
 
295
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
296
 
297
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
298
 
299
  # Analyse and redact image-based PDF or image
300
  if is_pdf_or_image(file_path) == False:
301
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
302
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
303
 
304
+ print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
305
 
306
  pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
307
  prepared_pdf_image_paths,
 
343
 
344
  if is_pdf(file_path) == False:
345
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
346
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
347
 
348
  # Analyse text-based pdf
349
  print('Redacting file as text-based PDF')
 
371
  else:
372
  out_message = "No redaction method selected"
373
  print(out_message)
374
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
375
 
376
  # If at last page, save to file
377
  if current_loop_page >= number_of_pages:
378
 
379
+ print("Current page loop:", current_loop_page, "is the last page.")
380
  latest_file_completed += 1
381
  current_loop_page = 999
382
 
 
385
 
386
  # Save file
387
  if is_pdf(file_path) == False:
388
+ out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
389
+ #pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
390
+ #print("pymupdf_doc", pymupdf_doc)
391
+ #print("pymupdf_doc[0]", pymupdf_doc[0])
392
+ pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
393
+ out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
394
 
395
  else:
396
+ out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
397
+ pymupdf_doc.save(out_redacted_pdf_file_path)
398
 
399
+ out_file_paths.append(out_redacted_pdf_file_path)
400
 
401
  #if log_files_output_paths:
402
  # log_files_output_paths.extend(log_files_output_paths)
403
 
404
+
405
+ out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
406
+
407
+ logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
408
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
409
  log_files_output_paths.append(logs_output_file_name)
410
 
411
+ all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
412
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
413
  out_file_paths.append(all_text_output_file_name)
414
 
415
  # Save the gradio_annotation_boxes to a JSON file
416
  try:
417
+ #print("Saving annotations to JSON")
418
 
419
+ out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
420
  with open(out_annotation_file_path, 'w') as f:
421
  json.dump(annotations_all_pages, f)
422
  log_files_output_paths.append(out_annotation_file_path)
423
 
424
+ #print("Saving annotations to CSV")
425
 
426
  # Convert json to csv and also save this
427
  #print("annotations_all_pages:", annotations_all_pages)
 
429
 
430
  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
431
 
432
+ out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
433
+ review_df.to_csv(out_review_file_path, index=None)
434
+ out_file_paths.append(out_review_file_path)
435
 
436
  print("Saved review file to csv")
437
 
438
  except Exception as e:
439
+ print("Could not save annotations to json or csv file:", e)
440
 
441
  # Make a combined message for the file
442
  if isinstance(out_message, list):
 
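The non-PDF branch earlier in this hunk relies on Pillow's ability to write a PIL image straight to a single-page PDF, which is what the save call with the "PDF" format string is doing. A minimal sketch of that pattern (file names and DPI here are illustrative, not taken from the app):

    from PIL import Image

    # Hypothetical input: one redacted page held as a PIL image.
    page_image = Image.open("redacted_page.png").convert("RGB")

    # Pillow's PDF writer turns the image into a one-page PDF.
    # save_all=False writes just this frame; append_images=[...] would add
    # further pages, mirroring the commented-out append_images argument above.
    page_image.save("redacted_as_pdf.pdf", "PDF", resolution=100.0, save_all=False)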
451
  combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
452
 
453
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
454
+ #print("Estimated total processing time:", str(estimate_total_processing_time))
455
 
456
  else:
457
  toc = time.perf_counter()
 
463
  if all_request_metadata:
464
  all_request_metadata_str = '\n'.join(all_request_metadata).strip()
465
 
466
+ all_request_metadata_file_path = output_folder + pdf_file_name_without_ext + "_textract_request_metadata.txt"
467
 
468
  with open(all_request_metadata_file_path, "w") as f:
469
  f.write(all_request_metadata_str)
 
478
 
479
  # Ensure no duplicated output files
480
  log_files_output_paths = list(set(log_files_output_paths))
481
+ out_file_paths = list(set(out_file_paths))
482
+ review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
483
 
484
+ #print("log_files_output_paths:", log_files_output_paths)
485
+ #print("out_file_paths:", out_file_paths)
486
+ #print("review_out_file_paths:", review_out_file_paths)
487
 
488
+
489
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
490
 
491
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
492
  '''
 
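convert_pikepdf_coords_to_pymupdf, declared above, bridges the two coordinate systems in play: pikepdf rectangles are in PDF user space with the origin at the bottom-left, while PyMuPDF measures from the top-left. The core of such a conversion is a vertical flip against the page height; a sketch of the usual formula (not necessarily the function's exact body):

    def flip_bbox_to_pymupdf(page_height, bbox):
        """Map a PDF user-space bbox (origin bottom-left) to PyMuPDF's
        top-left origin by mirroring the y values against the page height."""
        x0, y0, x1, y1 = bbox
        return (x0, page_height - y1, x1, page_height - y0)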
957
  nlp_analyser.registry.remove_recognizer("CUSTOM")
958
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
959
  #print("new_custom_recogniser:", new_custom_recogniser)
960
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
961
 
962
 
963
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
1051
 
1052
  #print("Image is in range of pages to redact")
1053
  if isinstance(image, str):
1054
+ print("Image is a file path:", image)
1055
  image = Image.open(image)
1056
 
1057
  # Need image size to convert textract OCR outputs to the correct sizes
 
1157
  all_image_annotations_boxes = []
1158
 
1159
  for box in merged_redaction_bboxes:
1160
+ #print("box:", box)
1161
 
1162
  x0 = box.left
1163
  y0 = box.top
 
1319
  for line in text_container
1320
  if isinstance(line, LTTextLine) or isinstance(line, LTTextLineHorizontal)
1321
  for char in line]
1322
+
1323
+ #print("Initial characters:", characters)
1324
 
1325
  return characters
1326
  return []
 
1334
  line_level_characters_out = []
1335
  #all_line_level_characters_out = []
1336
  character_objects_out = [] # New list to store character objects
1337
+ # character_text_objects_out = []
1338
 
1339
  # Initialize variables
1340
  full_text = ""
 
1349
  for char in char_objects:
1350
  character_objects_out.append(char) # Collect character objects
1351
 
1352
+ if not isinstance(char, LTAnno):
1353
+ character_text = char.get_text()
1354
+ # character_text_objects_out.append(character_text)
1355
+
1356
  if isinstance(char, LTAnno):
1357
 
1358
+ # print("Character line:", "".join(character_text_objects_out))
1359
+ # print("Char is an annotation object:", char)
1360
+
1361
  added_text = char.get_text()
1362
 
1363
  # Handle double quotes
1364
+ #added_text = added_text.replace('"', '\\"') # Escape double quotes
1365
 
1366
  # Handle space separately by finalizing the word
1367
  full_text += added_text # Adds space or newline
 
1378
  if current_word:
1379
  word_bboxes.append((current_word, current_word_bbox))
1380
  # Create an OCRResult for the current line
1381
+ line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0], 2), round(overall_bbox[1], 2), round(overall_bbox[2] - overall_bbox[0], 2), round(overall_bbox[3] - overall_bbox[1], 2)))
1382
  line_level_characters_out.append(character_objects_out)
1383
  # Reset for the next line
1384
  character_objects_out = []
 
1426
  # Convert special characters to a human-readable format
1427
  #full_text = full_text.encode('latin1', errors='replace').decode('utf-8')
1428
  full_text = clean_unicode_text(full_text)
1429
+ full_text = full_text.strip()
1430
  #print("full_text:", full_text)
1431
 
1432
+ line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
1433
 
1434
  #line_level_characters_out = character_objects_out
1435
 
1436
  return line_level_results_out, line_level_characters_out # Return both results and character objects
1437
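The .strip() calls added above are what remove the trailing line breaks from the OCR text outputs. As a rough standalone illustration of the same idea, here is a sketch that walks a page with pdfminer.six and collects line text with end-of-line characters stripped (the function and file names are hypothetical):

    from pdfminer.high_level import extract_pages
    from pdfminer.layout import LTAnno, LTChar, LTTextContainer

    def stripped_line_texts(pdf_path):
        """Collect per-line text with trailing newlines removed."""
        lines = []
        for page_layout in extract_pages(pdf_path):
            for container in page_layout:
                if not isinstance(container, LTTextContainer):
                    continue
                for line in container:
                    if not hasattr(line, "__iter__"):
                        continue
                    text = "".join(ch.get_text() for ch in line
                                   if isinstance(ch, (LTChar, LTAnno)))
                    lines.append(text.strip())  # drop the trailing "\n" pdfminer keeps
        return lines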
 
1438
 
1439
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1440
  decision_process_table = pd.DataFrame()
 
1485
  pikepdf_annotations_on_page.append(annotation)
1486
  return pikepdf_annotations_on_page
1487
 
1488
+ # def run_page_text_redaction(language: str, # Language of the PDF content
1489
+ # chosen_redact_entities: List[str], # List of entities to be redacted
1490
+ # chosen_redact_comprehend_entities: List[str],
1491
+ # line_level_text_results_list: List[str],
1492
+ # line_characters: List,
1493
+ # page_analyser_results: List = [],
1494
+ # page_analysed_bounding_boxes: List = [],
1495
+ # comprehend_client = None, # Connection to AWS Comprehend
1496
+ # allow_list: List[str] = None, # Optional list of allowed entities
1497
+ # pii_identification_method: str = "Local"
1498
+ # ):
1499
+
1500
+ # # Initialize batching variables
1501
+ # current_batch = ""
1502
+ # current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1503
+ # all_text_line_results = [] # Store results for all lines
1504
+ # text_container_analyser_results = []
1505
+ # text_container_analysed_bounding_boxes = []
1506
+
1507
+ # # First pass: collect all lines into batches
1508
+ # for i, text_line in enumerate(line_level_text_results_list):
1509
+ # if chosen_redact_entities:
1510
+ # if pii_identification_method == "Local":
1511
+
1512
+ # #print("chosen_redact_entities:", chosen_redact_entities)
1513
+
1514
+ # # Process immediately for local analysis
1515
+ # text_line_analyser_result = nlp_analyser.analyze(
1516
+ # text=text_line.text,
1517
+ # language=language,
1518
+ # entities=chosen_redact_entities,
1519
+ # score_threshold=score_threshold,
1520
+ # return_decision_process=True,
1521
+ # allow_list=allow_list
1522
+ # )
1523
+ # all_text_line_results.append((i, text_line_analyser_result))
1524
+
1525
+
1526
+ # elif pii_identification_method == "AWS Comprehend":
1527
+
1528
+ # # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1529
+ # custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1530
+
1531
+
1532
+ # text_line_analyser_result = nlp_analyser.analyze(
1533
+ # text=text_line.text,
1534
+ # language=language,
1535
+ # entities=custom_redact_entities,
1536
+ # score_threshold=score_threshold,
1537
+ # return_decision_process=True,
1538
+ # allow_list=allow_list
1539
+ # )
1540
+ # all_text_line_results.append((i, text_line_analyser_result))
1541
+
1542
+
1543
+ # if len(text_line.text) >= 3:
1544
+ # # Add separator between lines
1545
+ # if current_batch:
1546
+ # current_batch += " | "
1547
+
1548
+ # start_pos = len(current_batch)
1549
+ # current_batch += text_line.text
1550
+ # current_batch_mapping.append((start_pos, i, text_line))
1551
+
1552
+ # # Process batch if approaching 300 characters or last line
1553
+ # if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1554
+ # print("length of text for Comprehend:", len(current_batch))
1555
+
1556
+ # try:
1557
+ # response = comprehend_client.detect_pii_entities(
1558
+ # Text=current_batch,
1559
+ # LanguageCode=language
1560
+ # )
1561
+ # except Exception as e:
1562
+ # print(e)
1563
+ # time.sleep(3)
1564
+ # response = comprehend_client.detect_pii_entities(
1565
+ # Text=current_batch,
1566
+ # LanguageCode=language
1567
+ # )
1568
+
1569
+ # comprehend_query_number += 1
1570
+
1571
+ # # Process response and map back to original lines
1572
+ # if response and "Entities" in response:
1573
+ # for entity in response["Entities"]:
1574
+ # entity_start = entity["BeginOffset"]
1575
+ # entity_end = entity["EndOffset"]
1576
+
1577
+ # # Find which line this entity belongs to
1578
+ # for batch_start, line_idx, original_line in current_batch_mapping:
1579
+ # batch_end = batch_start + len(original_line.text)
1580
+
1581
+ # # Check if entity belongs to this line
1582
+ # if batch_start <= entity_start < batch_end:
1583
+ # # Adjust offsets relative to original line
1584
+ # relative_start = entity_start - batch_start
1585
+ # relative_end = min(entity_end - batch_start, len(original_line.text))
1586
+
1587
+ # result_text = original_line.text[relative_start:relative_end]
1588
+
1589
+ # if result_text not in allow_list:
1590
+ # if entity.get("Type") in chosen_redact_comprehend_entities:
1591
+ # # Create adjusted entity
1592
+ # adjusted_entity = entity.copy()
1593
+ # adjusted_entity["BeginOffset"] = relative_start
1594
+ # adjusted_entity["EndOffset"] = relative_end
1595
+
1596
+ # recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1597
+
1598
+ # # Add to results for this line
1599
+ # existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1600
+ # if not existing_results:
1601
+ # all_text_line_results.append((line_idx, [recogniser_entity]))
1602
+ # else:
1603
+ # existing_results.append(recogniser_entity)
1604
+
1605
+ # # Reset batch
1606
+ # current_batch = ""
1607
+ # current_batch_mapping = []
1608
+
1609
+ # # Second pass: process results for each line
1610
+ # for i, text_line in enumerate(line_level_text_results_list):
1611
+ # text_line_analyser_result = []
1612
+ # text_line_bounding_boxes = []
1613
+
1614
+ # # Get results for this line
1615
+ # line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1616
+
1617
+ # if line_results:
1618
+ # text_line_analyser_result = line_results
1619
+
1620
+ # #print("Analysed text container, now merging bounding boxes")
1621
+
1622
+ # # Merge bounding boxes if very close together
1623
+ # text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1624
+
1625
+ # #print("merged bounding boxes")
1626
+
1627
+ # text_container_analyser_results.extend(text_line_analyser_result)
1628
+ # #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1629
+
1630
+ # #print("text_container_analyser_results:", text_container_analyser_results)
1631
+
1632
+ # page_analyser_results.extend(text_container_analyser_results) # Add this line
1633
+ # page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
1634
+
1635
+ # return page_analysed_bounding_boxes
1636
+
1637
+ # def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
1638
+ # for entity in page_analyser_result:
1639
+ # entity_start = entity.start
1640
+ # entity_end = entity.end
1641
+
1642
+ # for batch_start, line_idx, original_line, chars in page_text_mapping:
1643
+ # batch_end = batch_start + len(original_line.text)
1644
+
1645
+ # if batch_start <= entity_start < batch_end:
1646
+ # relative_start = entity_start - batch_start
1647
+ # relative_end = min(entity_end - batch_start, len(original_line.text))
1648
+
1649
+ # adjusted_entity = copy.deepcopy(entity)
1650
+ # adjusted_entity.start = relative_start
1651
+ # adjusted_entity.end = relative_end
1652
+
1653
+ # existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
1654
+
1655
+ # if existing_entry is None:
1656
+ # all_text_line_results.append((line_idx, [adjusted_entity]))
1657
+ # else:
1658
+ # existing_entry.append(adjusted_entity)
1659
+ # break
1660
+
1661
+ # return all_text_line_results
1662
+
1663
+
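The batching logic commented out above (now handled by run_page_text_redaction, called further down) joins short lines into one Comprehend request and then maps the returned entity offsets back onto the individual lines. A condensed sketch of that offset arithmetic, assuming entities shaped like the detect_pii_entities response:

    def map_entities_to_lines(entities, batch_mapping):
        """batch_mapping holds (start_pos, line_idx, line_text) for each line
        that was concatenated into the batch sent to AWS Comprehend."""
        per_line = {}
        for entity in entities:
            start, end = entity["BeginOffset"], entity["EndOffset"]
            for batch_start, line_idx, line_text in batch_mapping:
                batch_end = batch_start + len(line_text)
                if batch_start <= start < batch_end:
                    adjusted = dict(entity)
                    # Re-express offsets relative to the original line
                    adjusted["BeginOffset"] = start - batch_start
                    adjusted["EndOffset"] = min(end - batch_start, len(line_text))
                    per_line.setdefault(line_idx, []).append(adjusted)
                    break
        return per_line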
1664
  def redact_text_pdf(
1665
  filename: str, # Path to the PDF file to be redacted
1666
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
 
1783
 
1784
  for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1785
 
1786
+ all_line_characters = []
1787
+ all_line_level_text_results_list = []
1788
  page_analyser_results = []
1789
  page_analysed_bounding_boxes = []
1790
 
1791
  characters = []
1792
  pikepdf_annotations_on_page = []
1793
  decision_process_table_on_page = pd.DataFrame()
1794
+ page_text_ocr_outputs = pd.DataFrame()
1795
 
1796
  if analysis_type == text_ocr_option:
1797
  for n, text_container in enumerate(page_layout):
1798
+
1799
  characters = []
1800
 
1801
+ #print("text container:", text_container)
1802
+
1803
  if isinstance(text_container, LTTextContainer) or isinstance(text_container, LTAnno):
1804
  characters = get_text_container_characters(text_container)
1805
 
1806
  # Create dataframe for all the text on the page
1807
  line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1808
 
1809
+ ### Create page_text_ocr_outputs (OCR format outputs)
1810
  if line_level_text_results_list:
1811
  # Convert to DataFrame and add to ongoing logging table
1812
  line_level_text_results_df = pd.DataFrame([{
1813
  'page': page_no + 1,
1814
+ 'text': (result.text).strip(),
1815
  'left': result.left,
1816
  'top': result.top,
1817
  'width': result.width,
1818
  'height': result.height
1819
  } for result in line_level_text_results_list])
1820
 
1821
+ page_text_ocr_outputs = pd.concat([page_text_ocr_outputs, line_level_text_results_df])
1822
+
1823
+ all_line_level_text_results_list.extend(line_level_text_results_list)
1824
+ all_line_characters.extend(line_characters)
1825
+
1826
+ ### REDACTION
1827
+
1828
+ page_analysed_bounding_boxes = run_page_text_redaction(
1829
+ language,
1830
+ chosen_redact_entities,
1831
+ chosen_redact_comprehend_entities,
1832
+ all_line_level_text_results_list, #line_level_text_results_list,
1833
+ all_line_characters,
1834
+ page_analyser_results,
1835
+ page_analysed_bounding_boxes,
1836
+ comprehend_client,
1837
+ allow_list,
1838
+ pii_identification_method,
1839
+ nlp_analyser,
1840
+ score_threshold,
1841
+ custom_entities,
1842
+ comprehend_query_number
1843
+ )
 
1844
 
1845
 
1846
  #print("page_analyser_results:", page_analyser_results)
 
1872
  reported_page_no = page_no + 1
1873
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1874
 
1875
+ # Join extracted text outputs for all lines together
1876
+ if not page_text_ocr_outputs.empty:
1877
+ page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1878
+ all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_ocr_outputs])
1879
+
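The sort added above orders each page's OCR rows for the output CSV. Because pdfminer reports coordinates in PDF user space, where the origin is the bottom-left corner, sorting top in descending order puts the lines that appear highest on the page first. A small sketch of the effect (values are made up):

    import pandas as pd

    page_text_ocr_outputs = pd.DataFrame({
        "top":  [700.0, 750.0],         # larger top = nearer the top of the page
        "left": [72.0, 72.0],
        "text": ["second line", "first line"],
    })
    ordered = page_text_ocr_outputs.sort_values(
        ["top", "left"], ascending=[False, False]).reset_index(drop=True)
    print(ordered["text"].tolist())  # ['first line', 'second line']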
1880
  # Write logs
1881
  # Create decision process table
1882
  decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, current_loop_page)
1883
 
1884
  if not decision_process_table_on_page.empty:
1885
  all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
1886
+ #print("all_decision_process_table:", all_decision_process_table)
 
1887
 
1888
  toc = time.perf_counter()
1889
 
tools/helper_functions.py CHANGED
@@ -1,10 +1,13 @@
1
  import os
2
  import re
3
  import gradio as gr
4
  import pandas as pd
5
  import unicodedata
6
  from typing import List
7
  from gradio_image_annotation import image_annotator
 
8
 
9
  def reset_state_vars():
10
  return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -120,6 +123,8 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
120
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
121
  #regex_file_name_no_ext = get_file_path_end(regex_file_name)
122
 
123
  output_text = file_type + " file loaded."
124
 
125
  print(output_text)
@@ -229,10 +234,10 @@ async def get_connection_params(request: gr.Request):
229
  #if 'context' in request_data:
230
  # print("Request context dictionary:", request_data['context'])
231
 
232
- print("Request headers dictionary:", request.headers)
233
- print("All host elements", request.client)
234
- print("IP address:", request.client.host)
235
- print("Query parameters:", dict(request.query_params))
236
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
237
  #print("Request dictionary to object:", request.request.body())
238
  print("Session hash:", request.session_hash)
@@ -264,6 +269,23 @@ async def get_connection_params(request: gr.Request):
264
  elif 'x-amzn-oidc-identity' in request.headers:
265
  out_session_hash = request.headers['x-amzn-oidc-identity']
266
  base_folder = "user-files/"
267
  print("Cognito ID found:", out_session_hash)
268
 
269
  else:
 
1
  import os
2
  import re
3
+ import boto3
4
+ from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
7
  import unicodedata
8
  from typing import List
9
  from gradio_image_annotation import image_annotator
10
+ from tools.auth import user_pool_id
11
 
12
  def reset_state_vars():
13
  return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
 
123
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
124
  #regex_file_name_no_ext = get_file_path_end(regex_file_name)
125
 
126
+ custom_regex.columns = custom_regex.columns.astype(str)
127
+
128
  output_text = file_type + " file loaded."
129
 
130
  print(output_text)
 
234
  #if 'context' in request_data:
235
  # print("Request context dictionary:", request_data['context'])
236
 
237
+ # print("Request headers dictionary:", request.headers)
238
+ # print("All host elements", request.client)
239
+ # print("IP address:", request.client.host)
240
+ # print("Query parameters:", dict(request.query_params))
241
  # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
242
  #print("Request dictionary to object:", request.request.body())
243
  print("Session hash:", request.session_hash)
 
269
  elif 'x-amzn-oidc-identity' in request.headers:
270
  out_session_hash = request.headers['x-amzn-oidc-identity']
271
  base_folder = "user-files/"
272
+
273
+ # Fetch email address using Cognito client
274
+ cognito_client = boto3.client('cognito-idp')
275
+ try:
276
+ response = cognito_client.admin_get_user(
277
+ UserPoolId=user_pool_id, # Replace with your User Pool ID
278
+ Username=out_session_hash
279
+ )
280
+ email = next(attr['Value'] for attr in response['UserAttributes'] if attr['Name'] == 'email')
281
+ #print("Email address found:", email)
282
+
283
+ out_session_hash = email
284
+ except ClientError as e:
285
+ print("Error fetching user details:", e)
286
+ email = None
287
+
288
+
289
  print("Cognito ID found:", out_session_hash)
290
 
291
  else:
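The new block above resolves the Cognito identity taken from the x-amzn-oidc-identity header into an email address via admin_get_user. Note that this call needs the cognito-idp:AdminGetUser permission on the user pool. A minimal standalone sketch (the pool ID and username are placeholders):

    import boto3
    from botocore.exceptions import ClientError

    cognito_client = boto3.client('cognito-idp')
    try:
        response = cognito_client.admin_get_user(
            UserPoolId="eu-west-2_examplePool",   # placeholder pool ID
            Username="example-cognito-identity"   # e.g. the OIDC identity from the header
        )
        email = next(attr['Value'] for attr in response['UserAttributes']
                     if attr['Name'] == 'email')
    except ClientError as e:
        print("Error fetching user details:", e)
        email = None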
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -7,7 +7,6 @@ spacy.prefer_gpu()
7
  from spacy.cli.download import download
8
  import re
9
 
10
- # %%
11
  model_name = "en_core_web_sm" #"en_core_web_trf"
12
  score_threshold = 0.001
13
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
@@ -34,7 +33,7 @@ def custom_word_list_recogniser(custom_list:List[str]=[]):
34
  rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
35
  for term in custom_list
36
  )
37
- print(custom_regex)
38
 
39
  custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
40
 
 
7
  from spacy.cli.download import download
8
  import re
9
 
 
10
  model_name = "en_core_web_sm" #"en_core_web_trf"
11
  score_threshold = 0.001
12
  custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
 
33
  rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
34
  for term in custom_list
35
  )
36
+ #print(custom_regex)
37
 
38
  custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
39
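For context, custom_word_list_recogniser builds one alternation regex over the deny-list terms and hands it to Presidio, which is what makes the custom text searches (including the new multi-line lists) work once each term is escaped. A minimal reconstruction under those assumptions (the quote-replacement step from the module is omitted here):

    import re
    from presidio_analyzer import Pattern, PatternRecognizer

    def custom_word_list_recogniser(custom_list):
        # One alternation regex; the \w look-arounds keep matches on word
        # boundaries, and re.escape makes regex metacharacters literal.
        custom_regex = '|'.join(
            rf'(?<!\w){re.escape(term.strip())}(?!\w)' for term in custom_list)
        custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)
        return PatternRecognizer(supported_entity="CUSTOM", patterns=[custom_pattern])

    # Swapped in and out of the registry as the app code above does:
    # nlp_analyser.registry.remove_recognizer("CUSTOM")
    # nlp_analyser.registry.add_recognizer(custom_word_list_recogniser(["Jane Doe"]))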
 
tools/redaction_review.py CHANGED
@@ -117,13 +117,10 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
117
  recogniser_dataframe_out = gr.Dataframe(review_dataframe)
118
  recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
119
 
120
- print("recogniser_entities_list all options:", recogniser_entities_list)
121
-
122
  recogniser_entities_list = sorted(recogniser_entities_list)
123
  recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
124
  recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
125
 
126
- print("recogniser_entities_list:", recogniser_entities_list)
127
 
128
  zoom_str = str(zoom) + '%'
129
  recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
@@ -248,6 +245,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
248
 
249
  output_files = []
250
  output_log_files = []
 
251
 
252
  #print("File paths in apply_redactions:", file_paths)
253
 
@@ -264,7 +262,8 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
264
 
265
  for file_path in file_paths:
266
  #print("file_path:", file_path)
267
- file_base = get_file_path_end(file_path)
 
268
 
269
  file_extension = os.path.splitext(file_path)[1].lower()
270
 
@@ -287,7 +286,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
287
 
288
  draw.rectangle(coords, fill=fill)
289
 
290
- image.save(output_folder + file_base + "_redacted.png")
291
 
292
  doc = [image]
293
 
@@ -298,6 +297,9 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
298
  # If working with pdfs
299
  elif is_pdf(file_path) == True:
300
  pdf_doc = pymupdf.open(file_path)
301
 
302
  number_of_pages = pdf_doc.page_count
303
 
@@ -316,7 +318,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
316
  #all_image_annotations[i]['image'] = image_loc.tolist()
317
  elif isinstance(image_loc, Image.Image):
318
  image = image_loc
319
- #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
320
  #image_loc.save(image_out_folder)
321
  #all_image_annotations[i]['image'] = image_out_folder
322
  elif isinstance(image_loc, str):
@@ -330,25 +332,34 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
330
 
331
  #try:
332
  if pdf_doc:
333
- out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
334
  pdf_doc.save(out_pdf_file_path)
335
  output_files.append(out_pdf_file_path)
336
 
337
  try:
338
- print("Saving annotations to JSON")
339
 
340
- out_annotation_file_path = output_folder + file_base + '_review_file.json'
341
  with open(out_annotation_file_path, 'w') as f:
342
  json.dump(all_image_annotations, f)
343
  output_log_files.append(out_annotation_file_path)
344
 
345
- print("Saving annotations to CSV review file")
346
 
347
  #print("review_file_state:", review_file_state)
348
 
349
  # Convert json to csv and also save this
350
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
351
- out_review_file_file_path = output_folder + file_base + '_review_file.csv'
352
  review_df.to_csv(out_review_file_file_path, index=None)
353
  output_files.append(out_review_file_file_path)
354
 
@@ -367,9 +378,6 @@ def update_entities_df(choice:str, df:pd.DataFrame):
367
  return df.loc[df["label"]==choice,:]
368
 
369
  def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
370
- #print("index", evt.index)
371
- #print("value", evt.value)
372
- #print("row_value", evt.row_value)
373
  row_value_page = evt.row_value[0] # This is the page number value
374
  return row_value_page
375
 
 
117
  recogniser_dataframe_out = gr.Dataframe(review_dataframe)
118
  recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
119
 
120
  recogniser_entities_list = sorted(recogniser_entities_list)
121
  recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction'] # Remove any existing 'Redaction'
122
  recogniser_entities_list.insert(0, 'Redaction') # Add 'Redaction' to the start of the list
123
 
 
124
 
125
  zoom_str = str(zoom) + '%'
126
  recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
 
245
 
246
  output_files = []
247
  output_log_files = []
248
+ pdf_doc = []
249
 
250
  #print("File paths in apply_redactions:", file_paths)
251
 
 
262
 
263
  for file_path in file_paths:
264
  #print("file_path:", file_path)
265
+ file_name_without_ext = get_file_path_end(file_path)
266
+ file_name_with_ext = os.path.basename(file_path)
267
 
268
  file_extension = os.path.splitext(file_path)[1].lower()
269
 
 
286
 
287
  draw.rectangle(coords, fill=fill)
288
 
289
+ image.save(output_folder + file_name_without_ext + "_redacted.png")
290
 
291
  doc = [image]
292
 
 
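For image inputs, the redaction above is plain rasterisation: each box is drawn as a filled rectangle straight onto the page bitmap before saving, so the underlying pixels are overwritten rather than merely covered. The same pattern in isolation (paths and coordinates are illustrative):

    from PIL import Image, ImageDraw

    image = Image.open("page_1.png").convert("RGB")   # hypothetical page image
    draw = ImageDraw.Draw(image)
    for coords in [(100, 200, 300, 240)]:             # (x0, y0, x1, y1) boxes
        draw.rectangle(coords, fill=(0, 0, 0))        # overwrite pixels in black
    image.save("page_1_redacted.png")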
297
  # If working with pdfs
298
  elif is_pdf(file_path) == True:
299
  pdf_doc = pymupdf.open(file_path)
300
+ orig_pdf_file_path = file_path
301
+
302
+ output_files.append(orig_pdf_file_path)
303
 
304
  number_of_pages = pdf_doc.page_count
305
 
 
318
  #all_image_annotations[i]['image'] = image_loc.tolist()
319
  elif isinstance(image_loc, Image.Image):
320
  image = image_loc
321
+ #image_out_folder = output_folder + file_name_without_ext + "_page_" + str(i) + ".png"
322
  #image_loc.save(image_out_folder)
323
  #all_image_annotations[i]['image'] = image_out_folder
324
  elif isinstance(image_loc, str):
 
332
 
333
  #try:
334
  if pdf_doc:
335
+ out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
336
  pdf_doc.save(out_pdf_file_path)
337
  output_files.append(out_pdf_file_path)
338
 
339
+ else:
340
+ print("PDF input not found.")
341
+
342
+ # If save_pdf is not true, then add the original pdf to the output files
343
+ else:
344
+ if is_pdf(file_path) == True:
345
+ orig_pdf_file_path = file_path
346
+ output_files.append(orig_pdf_file_path)
347
+
348
  try:
349
+ #print("Saving annotations to JSON")
350
 
351
+ out_annotation_file_path = output_folder + file_name_with_ext + '_review_file.json'
352
  with open(out_annotation_file_path, 'w') as f:
353
  json.dump(all_image_annotations, f)
354
  output_log_files.append(out_annotation_file_path)
355
 
356
+ #print("Saving annotations to CSV review file")
357
 
358
  #print("review_file_state:", review_file_state)
359
 
360
  # Convert json to csv and also save this
361
  review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
362
+ out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
363
  review_df.to_csv(out_review_file_file_path, index=None)
364
  output_files.append(out_review_file_file_path)
365
 
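The review-file pipeline above serialises the annotator state to JSON and then flattens it to CSV via convert_review_json_to_pandas_df. A hedged sketch of that flattening, assuming each page entry follows the gradio_image_annotation shape of {"image": ..., "boxes": [{"label": ..., ...}]} (the real function also merges in the review state table, which is omitted here):

    import json
    import pandas as pd

    with open("example_review_file.json") as f:   # hypothetical annotations file
        pages = json.load(f)

    rows = [dict(box, page=page_no + 1)
            for page_no, page in enumerate(pages)
            for box in (page.get("boxes") or [])]
    pd.DataFrame(rows).to_csv("example_review_file.csv", index=None)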
 
378
  return df.loc[df["label"]==choice,:]
379
 
380
 
381
  row_value_page = evt.row_value[0] # This is the page number value
382
  return row_value_page
383