seanpedrickcase committed
Commit ebf9010 · 1 Parent(s): 15026f7

Added a 'Review redactions' tab to the app. You can now visually inspect suggested redactions and modify or add redactions with a point-and-click interface.

app.py CHANGED
@@ -4,10 +4,13 @@ import socket
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
 os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'

+from gradio_image_annotation import image_annotator
+
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, wipe_logs, custom_regex_load
 from tools.aws_functions import upload_file_to_s3
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
+from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 #from tools.aws_functions import load_data_from_aws
@@ -53,6 +56,10 @@ with app:
 session_hash_state = gr.State()
 s3_output_folder_state = gr.State()

+pdf_doc_state = gr.State([])
+images_pdf_state = gr.State([]) # List of pdf pages converted to PIL images
+all_image_annotations_state = gr.State([])
+
 # Logging state
 feedback_logs_state = gr.State(feedback_logs_folder + 'log.csv')
 feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
@@ -65,9 +72,12 @@ with app:
 session_hash_textbox = gr.Textbox(value="", visible=False) # Invisible text box to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
 textract_metadata_textbox = gr.Textbox(value="", visible=False)
 doc_file_name_textbox = gr.Textbox(value="", visible=False)
+doc_file_name_with_extension_textbox = gr.Textbox(value="", visible=False)
 data_file_name_textbox = gr.Textbox(value="", visible=False)
 s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 estimated_time_taken_number = gr.Number(value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
+annotate_previous_page = gr.Number(value=1, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
+

 ###
 # UI DESIGN
@@ -106,7 +116,29 @@ with app:
 pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
 pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)

-
+# Object annotation
+with gr.Tab("Review redactions", id="tab_object_annotation"):
+
+    with gr.Row():
+        annotation_last_page_button = gr.Button("Previous page")
+        annotate_current_page = gr.Number(value=1, label="Current page", precision=0)
+
+        annotation_next_page_button = gr.Button("Next page")
+
+    annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+
+    annotator = image_annotator(
+        label="Modify redaction boxes",
+        label_list=["Redaction"],
+        label_colors=[(0, 0, 0)],
+        sources=None,#["upload"],
+        show_clear_button=False,
+        show_remove_button=False,
+        interactive=False
+    )
+
+    output_review_files = gr.File(label="Review output files")
+
 # TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
 gr.Markdown(
@@ -170,17 +202,29 @@ with app:
 ###
 # PDF/IMAGE REDACTION
 ###
-in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox])
+in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])

-document_redact_btn.click(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare_doc").\
-    then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
-    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox], api_name="redact_doc")
+document_redact_btn.click(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state], api_name="prepare_doc").\
+    then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, pdf_doc_state],
+    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state], api_name="redact_doc").\
+    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page])

 # If the output file count text box changes, keep going with redacting each document until done
-text_documents_done.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
-    then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox],
-    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox]).\
-    then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
+text_documents_done.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state]).\
+    then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, pdf_doc_state],
+    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state]).\
+    then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
+    then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
+
+annotate_current_page.change(
+    modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page]).\
+    then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page])
+
+annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page])
+annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page])
+
+#annotation_button_get.click(get_boxes_json, annotator, json_boxes)
+annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)

 ###
 # TABULAR DATA REDACTION
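Note: the paging helpers decrease_page and increase_page wired above are imported from tools.redaction_review, but their bodies are not part of this diff. A minimal sketch of what such helpers might look like, assuming they simply clamp the current page to the range of annotated pages (the names match the imports; the bodies here are illustrative guesses, not the module's code):

    def decrease_page(number: int) -> int:
        # Step back one page, never going below page 1
        return max(number - 1, 1)

    def increase_page(number: int, all_image_annotations: list) -> int:
        # Step forward one page, never going past the last annotated page
        if not all_image_annotations:
            return 1
        return min(number + 1, len(all_image_annotations))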
redaction_review.py ADDED
@@ -0,0 +1,88 @@
+import gradio as gr
+from gradio_image_annotation import image_annotator
+from gradio_image_annotation.image_annotator import AnnotatedImageData
+
+from tools.file_conversion import is_pdf, convert_pdf_to_images
+from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_redaction import redact_page_with_pymupdf
+import json
+import pymupdf
+from PIL import ImageDraw, Image
+
+file_path = "output/page_as_img_example_complaint_letter_pages_1.png"
+#file_path = "examples/graduate-job-example-cover-letter.pdf"
+
+
+if is_pdf(file_path):
+    images = convert_pdf_to_images(file_path)
+    image = images[0]
+    doc = pymupdf.open(file_path)
+else:
+    doc = []
+
+with open('output/gradio_annotation_boxes.json', 'r') as f:
+    gradio_annotation_boxes = json.load(f)
+
+example_annotation = {
+    "image": file_path,
+    "boxes": gradio_annotation_boxes
+}
+
+def apply_redactions(image_annotated:AnnotatedImageData, file_path:str, doc=[]):
+    #print(image_annotated['image'])
+
+    file_base = get_file_path_end(file_path)
+
+    image = Image.fromarray(image_annotated['image'].astype('uint8'))
+
+    draw = ImageDraw.Draw(image)
+
+    if is_pdf(file_path) == False:
+        for img_annotation_box in image_annotated['boxes']:
+            coords = [img_annotation_box["xmin"],
+                      img_annotation_box["ymin"],
+                      img_annotation_box["xmax"],
+                      img_annotation_box["ymax"]]
+
+            fill = img_annotation_box["color"]
+
+            draw.rectangle(coords, fill=fill)
+
+        image.save(output_folder + file_base + "_additional.png")
+
+    # If it's a pdf, assume a doc object is available
+    else:
+        doc = redact_page_with_pymupdf(doc, image_annotated, 1, image)
+
+
+def crop(annotations):
+    if annotations["boxes"]:
+        box = annotations["boxes"][0]
+        return annotations["image"][
+            box["ymin"]:box["ymax"],
+            box["xmin"]:box["xmax"]
+        ]
+    return None
+
+def get_boxes_json(annotations):
+    return annotations["boxes"]
+
+with gr.Blocks() as demo:
+    with gr.Tab("Object annotation", id="tab_object_annotation"):
+
+        doc_state = gr.State(doc)
+
+        file_path_textbox = gr.Textbox(value=file_path)
+        annotator = image_annotator(
+            example_annotation,
+            label_list=["Redaction"],
+            label_colors=[(0, 0, 0)],
+        )
+        button_get = gr.Button("Get bounding boxes")
+        button_apply = gr.Button("Apply redactions")
+        json_boxes = gr.JSON()
+        button_get.click(get_boxes_json, annotator, json_boxes)
+        button_apply.click(apply_redactions, inputs=[annotator, file_path_textbox, doc_state])
+
+if __name__ == "__main__":
+    demo.launch(inbrowser=True)
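Note: the box dictionaries this module reads (xmin/ymin/xmax/ymax/color) follow gradio_image_annotation's annotation format. An illustrative example of the structure loaded from gradio_annotation_boxes.json, with made-up coordinates:

    example_annotation = {
        "image": "output/page_as_img_example_complaint_letter_pages_1.png",
        "boxes": [
            {
                "xmin": 100,           # left edge, in image pixels
                "ymin": 200,           # top edge
                "xmax": 340,           # right edge
                "ymax": 230,           # bottom edge
                "label": "Redaction",  # must match a label_list entry
                "color": (0, 0, 0),    # fill colour used when the box is drawn
            }
        ],
    }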
requirements.txt CHANGED
@@ -14,3 +14,4 @@ boto3==1.34.158
 pyarrow==14.0.2
 openpyxl==3.1.2
 Faker==22.2.0
+gradio_image_annotation==0.2.3
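Note: when testing locally outside the Space, the new component can be installed with the same pin:

    pip install gradio_image_annotation==0.2.3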
tools/aws_functions.py CHANGED
@@ -14,7 +14,6 @@ aws_var_default = "0"
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
17
- # Launch the Gradio app
18
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
19
  print(f'The value of AWS_REGION is {AWS_REGION}')
20
 
 
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
 
17
  AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
18
  print(f'The value of AWS_REGION is {AWS_REGION}')
19
 
tools/file_conversion.py CHANGED
@@ -53,8 +53,18 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
 print("Converting page: ", str(page_num + 1))

 # Convert one page to image
-image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
+out_path = pdf_path + "_" + str(page_num) + ".png"
+
+# Ensure the directory exists
+os.makedirs(os.path.dirname(out_path), exist_ok=True)
+
+# Check if the image already exists
+if os.path.exists(out_path):
+    print(f"Loading existing image from {out_path}.")
+    image = [Image.open(out_path)] # Load the existing image
+else:
+    image = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300, use_cropbox=True, use_pdftocairo=False)
+    image[0].save(out_path, format="PNG") # Save the new image

 # If no images are returned, break the loop
 if not image:
@@ -64,7 +74,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
 # print("Conversion of page", str(page_num), "to file succeeded.")
 # print("image:", image)

-#image[0].save(pdf_path + "_" + str(page_num) + ".png", format="PNG")
+

 images.extend(image)
@@ -105,6 +115,8 @@ def get_input_file_names(file_input):
 all_relevant_files = []

+#print("file_input:", file_input)
+
 for file in file_input:
 file_path = file.name
 print(file_path)
@@ -114,15 +126,17 @@ def get_input_file_names(file_input):
 file_extension = os.path.splitext(file_path)[1].lower()

+file_name_with_extension = file_path_without_ext + file_extension
+
 # Check if the file is an image type
-if file_extension in ['.jpg', '.jpeg', '.png', '.xlsx', '.csv', '.parquet']:
+if file_extension in ['.jpg', '.jpeg', '.png', '.pdf', '.xlsx', '.csv', '.parquet']:
 all_relevant_files.append(file_path_without_ext)

 all_relevant_files_str = ", ".join(all_relevant_files)

-print("all_relevant_files_str:", all_relevant_files_str)
+#print("all_relevant_files_str:", all_relevant_files_str)

-return all_relevant_files_str
+return all_relevant_files_str, file_name_with_extension

 def prepare_image_or_pdf(
 file_paths: List[str],
@@ -154,7 +168,7 @@ def prepare_image_or_pdf(
 tic = time.perf_counter()

-# If out message or out_file_paths are blank, change to a list so it can be appended to
+# If out message or converted_file_paths are blank, change to a list so it can be appended to
 if isinstance(out_message, str):
 out_message = [out_message]
@@ -162,15 +176,17 @@ def prepare_image_or_pdf(
 if first_loop_state==True:
 latest_file_completed = 0
 out_message = []
-out_file_paths = []
+converted_file_paths = []
+image_file_paths = []
 else:
 print("Now attempting file:", str(latest_file_completed))
-out_file_paths = []
+converted_file_paths = []
+image_file_paths = []

 if not file_paths:
 file_paths = []

-#out_file_paths = file_paths
+#converted_file_paths = file_paths

 latest_file_completed = int(latest_file_completed)
@@ -181,7 +197,7 @@ def prepare_image_or_pdf(
 final_out_message = '\n'.join(out_message)
 else:
 final_out_message = out_message
-return final_out_message, out_file_paths
+return final_out_message, converted_file_paths, image_file_paths

 #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
@@ -217,27 +233,33 @@ def prepare_image_or_pdf(
 if not file_path:
 out_message = "No file selected"
 print(out_message)
-return out_message, out_file_paths
+return out_message, converted_file_paths, image_file_paths

 if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
 # Analyse and redact image-based pdf or image
 if is_pdf_or_image(file_path) == False:
 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
 print(out_message)
-return out_message, out_file_paths
+return out_message, converted_file_paths, image_file_paths

-out_file_path = process_file(file_path)
-#print("Out file path at image conversion step:", out_file_path)
+converted_file_path = process_file(file_path)
+image_file_path = converted_file_path
+#print("Out file path at image conversion step:", converted_file_path)

 elif in_redact_method == "Simple text analysis - PDFs with selectable text":
 if is_pdf(file_path) == False:
 out_message = "Please upload a PDF file for text analysis."
 print(out_message)
-return out_message, out_file_paths
+return out_message, converted_file_paths, image_file_paths
+
+converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
+image_file_path = process_file(file_path)

-out_file_path = file_path
-
-out_file_paths.append(out_file_path)
+converted_file_paths.append(converted_file_path)
+image_file_paths.extend(image_file_path)
+
+#print("file conversion image_file_paths:", image_file_paths)

 toc = time.perf_counter()
 out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
@@ -247,7 +269,7 @@ def prepare_image_or_pdf(
 out_message.append(out_time)
 out_message_out = '\n'.join(out_message)

-return out_message_out, out_file_paths
+return out_message_out, converted_file_paths, image_file_paths

 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 file_path_without_ext = get_file_path_end(in_file_path)
@@ -270,4 +292,4 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 #print("Out file paths:", out_file_paths)

-return out_message, out_file_paths
+return out_message, out_file_paths
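Note: the main change above is a page-image cache in convert_pdf_to_images, so each PDF page is rendered to PNG once and re-loaded on later runs. Distilled into a standalone sketch (assuming convert_from_path comes from pdf2image, as in the existing module):

    import os
    from PIL import Image
    from pdf2image import convert_from_path

    def load_or_convert_page(pdf_path: str, page_num: int):
        out_path = pdf_path + "_" + str(page_num) + ".png"
        if os.path.exists(out_path):
            # Cache hit: reuse the PNG rendered on a previous run
            return [Image.open(out_path)]
        # Cache miss: render just this page at 300 dpi, then save it for next time
        image = convert_from_path(pdf_path, first_page=page_num + 1,
                                  last_page=page_num + 1, dpi=300,
                                  use_cropbox=True, use_pdftocairo=False)
        image[0].save(out_path, format="PNG")
        return image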
tools/file_redaction.py CHANGED
@@ -4,7 +4,7 @@ import json
 import io
 import os
 from PIL import Image, ImageChops, ImageDraw
-from typing import List, Dict
+from typing import List, Dict, Tuple
 import pandas as pd

 #from presidio_image_redactor.entities import ImageRecognizerResult
@@ -12,13 +12,11 @@ from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
 from pikepdf import Pdf, Dictionary, Name
 import pymupdf
-from pymupdf import Rect
+from pymupdf import Rect
+from fitz import Document, Page

 import gradio as gr
 from gradio import Progress
-
-from typing import Tuple
-
 from collections import defaultdict # For efficient grouping

 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
@@ -50,7 +48,7 @@ def sum_numbers_before_seconds(string:str):

 return sum_of_numbers

-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", progress=gr.Progress(track_tqdm=True)):
+def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[str], prepared_pdf_image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", all_image_annotations:dict={}, pdf_text=[], progress=gr.Progress(track_tqdm=True)):
 '''
 Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
 '''
@@ -63,6 +61,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 latest_file_completed = 0
 #out_message = []
 out_file_paths = []
+pdf_text = []

 # If out message is string or out_file_paths are blank, change to a list so it can be appended to
 if isinstance(out_message, str):
@@ -73,9 +72,11 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag

 latest_file_completed = int(latest_file_completed)

+#pdf_text = []
+
 # If we have already redacted the last file, return the input out_message and file list to the relevant components
 if latest_file_completed >= len(file_paths):
-print("Last file reached")
+#print("Last file reached")
 # Set to a very high number so as not to mix up with subsequent file processing by the user
 latest_file_completed = 99
 final_out_message = '\n'.join(out_message)
@@ -84,7 +85,9 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
 print("Estimated total processing time:", str(estimate_total_processing_time))

-return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str
+#print("Final all_image_annotations:", all_image_annotations)
+
+return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str, pdf_text, all_image_annotations

 file_paths_loop = [file_paths[int(latest_file_completed)]]

@@ -110,26 +113,26 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 else:
 out_message = "No file selected"
 print(out_message)
-return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
+return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations

 if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
 #Analyse and redact image-based pdf or image
 if is_pdf_or_image(file_path) == False:
 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
-return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str
+return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations

 print("Redacting file " + file_path_without_ext + " as an image-based file")

-pdf_images, redaction_logs, logging_file_paths, new_request_metadata = redact_image_pdf(file_path, image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)

 # Save file
 if is_pdf(file_path) == False:
 out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
-pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])

 else:
 out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
-pdf_images.save(out_image_file_path)

 out_file_paths.append(out_image_file_path)
 if logging_file_paths:
@@ -137,12 +140,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag

 out_message.append("File '" + file_path_without_ext + "' successfully redacted")

-# Save decision making process
-# output_logs_str = str(output_logs)
-# logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
-# with open(logs_output_file_name, "w") as f:
-#     f.write(output_logs_str)
-# log_files_output_paths.append(logs_output_file_name)

 logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
 redaction_logs.to_csv(logs_output_file_name)
@@ -160,14 +157,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag

 elif in_redact_method == "Simple text analysis - PDFs with selectable text":

-print("file_path:", file_path)

 if is_pdf(file_path) == False:
-return "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'.", None, None

 # Analyse text-based pdf
 print('Redacting file as text-based PDF')
-pdf_text, decision_process_logs, page_text_outputs = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")

 out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
 pdf_text.save(out_text_file_path)
@@ -200,7 +198,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 else:
 out_message = "No redaction method selected"
 print(out_message)
-return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

 toc = time.perf_counter()
 out_time = f"in {toc - tic:0.1f} seconds."
@@ -223,11 +221,132 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
 log_files_output_paths.append(all_request_metadata_file_path)

-return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str

-def redact_page_with_pymupdf(doc, annotations_on_page, page_no, image = None):#, scale=(1,1)):

-page = doc.load_page(page_no)
 mediabox_height = page.mediabox[3] - page.mediabox[1]
 mediabox_width = page.mediabox[2] - page.mediabox[0]
 rect_height = page.rect.height
@@ -236,62 +355,91 @@ def redact_page_with_pymupdf(doc, annotations_on_page, page_no, image = None):#,
 #print("page_rect_height:", page.rect.height)
 #print("page mediabox size:", page.mediabox[3] - page.mediabox[1])

 for annot in annotations_on_page:
-if isinstance(annot, CustomImageRecognizerResult):
-image_page_width, image_page_height = image.size

-# Calculate scaling factors between PIL image and pymupdf
-scale_width = rect_width / image_page_width
-scale_height = rect_height / image_page_height

-#scale_width = scale[0]
-#scale_height = scale[1]

-#print("scale:", scale)

-# Calculate scaled coordinates
-x1 = (annot.left * scale_width)# + page_x_adjust
-new_y1 = (annot.top * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
-x2 = ((annot.left + annot.width) * scale_width)# + page_x_adjust # Calculate x1
-new_y2 = ((annot.top + annot.height) * scale_height)# - page_y_adjust # Calculate y1 correctly

-rect = Rect(x1, new_y1, x2, new_y2) # Create the PyMuPDF Rect (y1, y0 are flipped)

-else:
-# Calculate scaling factors
-scale_height = rect_height / mediabox_height if mediabox_height else 1
-scale_width = rect_width / mediabox_width if mediabox_width else 1

-# Adjust coordinates based on scaling factors
-page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
-page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment

-#print("In the pikepdf conversion function")
-# Extract the /Rect field
-rect_field = annot["/Rect"]

-# Convert the extracted /Rect field to a list of floats (since pikepdf uses Decimal objects)
-rect_coordinates = [float(coord) for coord in rect_field]

-# Convert the Y-coordinates (flip using the page height)
-x1, y1, x2, y2 = rect_coordinates
-x1 = x1 + page_x_adjust
-new_y1 = (rect_height - y2) - page_y_adjust
-x2 = x2 + page_x_adjust
-new_y2 = (rect_height - y1) - page_y_adjust

-rect = Rect(x1, new_y1, x2, new_y2)

 # Convert to a PyMuPDF Rect object
 #rect = Rect(rect_coordinates)

-# Calculate the middle y value and set height to 1 pixel
-middle_y = (new_y1 + new_y2) / 2
-rect_single_pixel_height = Rect(x1, middle_y, x2, middle_y + 1) # Height of 1 pixel
-
-print("rect:", rect)
-# Add a redaction annotation
-#page.add_redact_annot(rect)

 # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
 page.add_redact_annot(rect_single_pixel_height)
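Note: the removed lines above show the two coordinate systems redact_page_with_pymupdf reconciles: annotator boxes are in PIL-image pixels, while pymupdf rects are in page points. A distilled sketch of that mapping (the helper name and signature here are mine, for illustration only):

    from pymupdf import Rect

    def image_box_to_page_rect(annot, image_size, page_rect) -> Rect:
        image_w, image_h = image_size            # PIL image size in pixels
        scale_w = page_rect.width / image_w      # points per pixel, horizontally
        scale_h = page_rect.height / image_h     # points per pixel, vertically
        x1 = annot.left * scale_w
        y1 = annot.top * scale_h
        x2 = (annot.left + annot.width) * scale_w
        y2 = (annot.top + annot.height) * scale_h
        return Rect(x1, y1, x2, y2)

The one-pixel-high rect passed to page.add_redact_annot is then built around the vertical midpoint of this box, so applying the redaction cannot delete text on adjacent lines; the full-height black rectangle is drawn separately as a shape.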
@@ -302,10 +450,18 @@ def redact_page_with_pymupdf(doc, annotations_on_page, page_no, image = None):#,
 shape.finish(color=(0, 0, 0), fill=(0, 0, 0)) # Black fill for the rectangle
 shape.commit()

 page.apply_redactions(images=0, graphics=0)
 page.clean_contents()

-return doc

 def bounding_boxes_overlap(box1, box2):
 """Check if two bounding boxes overlap."""
@@ -329,6 +485,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 # Reconstruct bounding boxes for substrings of interest
 reconstructed_bboxes = []
 for bbox in bboxes:
 bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
 for line_text, line_info in combined_results.items():
 line_box = line_info['bounding_box']
@@ -350,7 +507,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 current_char += 1 # +1 for space if the word doesn't already end with a space

 if relevant_words:
-print("Relevant words:", relevant_words)
 left = min(word['bounding_box'][0] for word in relevant_words)
 top = min(word['bounding_box'][1] for word in relevant_words)
 right = max(word['bounding_box'][2] for word in relevant_words)
@@ -358,6 +515,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result

 # Combine the text of all relevant words
 combined_text = " ".join(word['text'] for word in relevant_words)

 reconstructed_bbox = CustomImageRecognizerResult(
 bbox.entity_type,
@@ -393,12 +555,19 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 else:
 new_text = merged_box.text + " " + next_box.text

 new_left = min(merged_box.left, next_box.left)
 new_top = min(merged_box.top, next_box.top)
 new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
 new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
 merged_box = CustomImageRecognizerResult(
-merged_box.entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
 )
 else:
 merged_bboxes.append(merged_box)
@@ -408,7 +577,7 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result

 return merged_bboxes

-def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
 '''
 Take an path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
 '''
@@ -418,24 +587,25 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
 fill = (0, 0, 0) # Fill colour
 decision_process_output_str = ""
 images = []
 #request_metadata = {}
 image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

 # Also open as pymupdf pdf to apply annotations later on
-doc = pymupdf.open(file_path)

-if not image_paths:
 out_message = "PDF does not exist as images. Converting pages to image"
 print(out_message)

-image_paths = process_file(file_path)

-if not isinstance(image_paths, list):
-print("Converting image_paths to list")
-image_paths = [image_paths]

-#print("Image paths:", image_paths)
-number_of_pages = len(image_paths[0])

 print("Number of pages:", str(number_of_pages))
@@ -464,57 +634,37 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
 if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
 elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

-for n in range(0, number_of_pages):
 handwriting_or_signature_boxes = []
 signature_recogniser_results = []
 handwriting_recogniser_results = []

 try:
-image = image_paths[0][n]#.copy()
-print("Skipping page", str(n))
-#print("image:", image)
 except Exception as e:
-print("Could not redact page:", str(n), "due to:")
 print(e)
 continue

-if n >= page_min and n < page_max:

-i = n

 reported_page_number = str(i + 1)

 print("Redacting page", reported_page_number)

-
-# Assuming image_paths[i] is your PIL image object
-try:
-image = image_paths[0][i]#.copy()
-#print("image:", image)
-except Exception as e:
-print("Could not redact page:", reported_page_number, "due to:")
-print(e)
-continue

 # Need image size to convert textract OCR outputs to the correct sizes
 page_width, page_height = image.size

-
-# Get the dimensions of the page in points with pymupdf to get relative scale
-#page = doc.load_page(i)
-#mu_page_rect = page.rect
-#mu_page_width = mu_page_rect.width
-#mu_page_height = max(mu_page_rect.height, page.mediabox[3] - page.mediabox[1])
-#mu_page_width = max(mu_page_rect.width, page.mediabox[2] - page.mediabox[0])
-#mu_page_height = mu_page_rect.height
-
-# Calculate scaling factors between PIL image and pymupdf
-#scale_width = mu_page_width / page_width
-#scale_height = mu_page_height / page_height
-
-#scale = (scale_width, scale_height)
-
-
 # Possibility to use different languages
 if language == 'en':
 ocr_lang = 'eng'
@@ -559,21 +709,19 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

 line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)

-# Save ocr_with_children_output
-# ocr_results_with_children_str = str(line_level_ocr_results_with_children)
-# logs_output_file_name = output_folder + "ocr_with_children_textract.txt"
-# with open(logs_output_file_name, "w") as f:
-#     f.write(ocr_results_with_children_str)
-
 # Step 2: Analyze text and identify PII
-redaction_bboxes = image_analyser.analyze_text(
-line_level_ocr_results,
-line_level_ocr_results_with_children,
-language=language,
-entities=chosen_redact_entities,
-allow_list=allow_list,
-score_threshold=score_threshold,
-)

 if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
 elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
@@ -586,30 +734,62 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_
 # Merge close bounding boxes
 merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)

-

 # 3. Draw the merged boxes
 if is_pdf(file_path) == False:
 draw = ImageDraw.Draw(image)

 for box in merged_redaction_bboxes:
 x0 = box.left
 y0 = box.top
 x1 = x0 + box.width
 y1 = y0 + box.height
-draw.rectangle([x0, y0, x1, y1], fill=fill)

 ## Apply annotations with pymupdf
 else:
-doc = redact_page_with_pymupdf(doc, merged_redaction_bboxes, i, image)#, scale)
-
-#doc.save("image_redact.pdf")

-# Log OCR results

-#line_level_ocr_results_str = "Page:" + reported_page_number + "\n" + str(line_level_ocr_results)
-#all_ocr_results.append(line_level_ocr_results_str)

 # Convert to DataFrame and add to ongoing logging table
 line_level_ocr_results_df = pd.DataFrame([{
@@ -623,43 +803,21 @@ def redact_image_pdf(file_path:str, image_paths:List[str], language:str, chosen_

 all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])

-# Convert decision process to table
-# Export the decision making process
-if merged_redaction_bboxes:
-# for bbox in merged_redaction_bboxes:
-#     print(f"Entity: {bbox.entity_type}, Text: {bbox.text}, Bbox: ({bbox.left}, {bbox.top}, {bbox.width}, {bbox.height})")
-
-#decision_process_output_str = "Page " + reported_page_number + ":\n" + str(merged_redaction_bboxes)
-#all_decision_process.append(decision_process_output_str)
-
-decision_process_table = pd.DataFrame([{
-'page': reported_page_number,
-'entity_type': result.entity_type,
-'start': result.start,
-'end': result.end,
-'score': result.score,
-'left': result.left,
-'top': result.top,
-'width': result.width,
-'height': result.height,
-'text': result.text
-} for result in merged_redaction_bboxes])
-
-all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])
-
 if is_pdf(file_path) == False:
 images.append(image)
-doc = images

-# Write OCR results as a log file
-# line_level_ocr_results_out = "\n".join(all_ocr_results)
-# with open(ocr_results_file_path, "w") as f:
-#     f.write(line_level_ocr_results_out)

 all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
 logging_file_paths.append(ocr_results_file_path)

-return doc, all_decision_process_table, logging_file_paths, request_metadata

 def get_text_container_characters(text_container:LTTextContainer):

@@ -672,23 +830,27 @@ def get_text_container_characters(text_container:LTTextContainer):
 return characters
 return []

-
-def analyze_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
 '''
 Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
 '''

 text_to_analyze = text_container.text
 #print("text_to_analyze:", text_to_analyze)

-analyzer_results = nlp_analyser.analyze(text=text_to_analyze,
-language=language,
-entities=chosen_redact_entities,
-score_threshold=score_threshold,
-return_decision_process=True,
-allow_list=allow_list)
-return analyzer_results

 def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
 '''
@@ -768,16 +930,16 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup

 return line_level_results_out, line_level_characters_out # Return both results and character objects

-def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=0):
 '''
 Merge identified bounding boxes containing PII that are very close to one another
 '''
-analyzed_bounding_boxes = []
-if len(analyzer_results) > 0 and len(characters) > 0:
 # Extract bounding box coordinates for sorting
 bounding_boxes = []
 text_out = []
-for result in analyzer_results:
 char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
 char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
 if char_boxes:
@@ -823,14 +985,21 @@ def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, char
 current_box[2] = char_box[2] # Extend the current box horizontally
 current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
 current_result.end = max(current_result.end, result.end) # Extend the text range
 # Add a space if current_text is not empty
 if current_text:
 current_text.append(" ") # Add space between texts
 current_text.extend(text)
 else:
 merged_bounding_boxes.append(
 {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
 #print(f"Appending merged box: {current_box}")

 # Reset current_box and current_y after appending
 current_box = char_box
@@ -845,39 +1014,39 @@ def merge_text_bounding_boxes(analyzer_results:CustomImageRecognizerResult, char
 #print(f"Appending final box for result: {current_box}")

 if not merged_bounding_boxes:
-analyzed_bounding_boxes.extend(
 {"text":text, "boundingBox": char.bbox, "result": result}
-for result in analyzer_results
 for char in characters[result.start:result.end]
 if isinstance(char, LTChar)
 )
 else:
-analyzed_bounding_boxes.extend(merged_bounding_boxes)

-#print("Analyzed bounding boxes:\n\n", analyzed_bounding_boxes)

-return analyzed_bounding_boxes

-def create_text_redaction_process_results(analyzer_results, analyzed_bounding_boxes, page_num):
 decision_process_table = pd.DataFrame()

-if len(analyzer_results) > 0:
 # Create summary df of annotations to be made
-analyzed_bounding_boxes_df_new = pd.DataFrame(analyzed_bounding_boxes)
-analyzed_bounding_boxes_df_text = analyzed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
-analyzed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
-analyzed_bounding_boxes_df_new = pd.concat([analyzed_bounding_boxes_df_new, analyzed_bounding_boxes_df_text], axis = 1)
-analyzed_bounding_boxes_df_new['page'] = page_num + 1
-decision_process_table = pd.concat([decision_process_table, analyzed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)

 #print('\n\ndecision_process_table:\n\n', decision_process_table)

 return decision_process_table

-def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
 annotations_on_page = []
-for analyzed_bounding_box in analyzed_bounding_boxes:
-bounding_box = analyzed_bounding_box["boundingBox"]
 annotation = Dictionary(
 Type=Name.Annot,
 Subtype=Name.Square, #Name.Highlight,
@@ -887,7 +1056,7 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
 C=[0, 0, 0],
 IC=[0, 0, 0],
 CA=1, # Transparency
-T=analyzed_bounding_box["result"].entity_type,
 BS=Dictionary(
 W=0, # Border width: 1 point
 S=Name.S # Border style: solid
@@ -896,23 +1065,25 @@ def create_annotations_for_bounding_boxes(analyzed_bounding_boxes):
 annotations_on_page.append(annotation)
 return annotations_on_page

-def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Simple text analysis - PDFs with selectable text", progress=Progress(track_tqdm=True)):
 '''
 Redact chosen entities from a pdf that is made up of multiple pages that are not images.
 '''
 annotations_all_pages = []
 page_text_outputs_all_pages = pd.DataFrame()
 decision_process_table_all_pages = pd.DataFrame()

 combine_pixel_dist = 20 # Horizontal distance between PII bounding boxes under/equal they are combined into one

 # Open with Pikepdf to get text lines
-pdf = Pdf.open(filename)
-# Also open pdf with pymupdf to be able to annotate later while retaining text
-doc = pymupdf.open(filename)
-page_num = 0

-number_of_pages = len(pdf.pages)

 # Check that page_min and page_max are within expected ranges
 if page_max > number_of_pages or page_max == 0:
@@ -920,112 +1091,115 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
 #else:
 # page_max = page_max - 1

-if page_min <= 0:
-page_min = 0
-else:
-page_min = page_min - 1

-print("Page range is",str(page_min), "to", str(page_max))

-for page_no in range(page_min, page_max):
-page = pdf.pages[page_no]

-print("Page number is:", page_no)

-# The /MediaBox in a PDF specifies the size of the page [left, bottom, right, top]
-#media_box = page.MediaBox
-#page_width = media_box[2] - media_box[0]
-#page_height = media_box[3] - media_box[1]
-
-for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
-
-page_analyzer_results = []
-page_analyzed_bounding_boxes = []
-
-characters = []
-annotations_on_page = []
-decision_process_table_on_page = pd.DataFrame()
-page_text_outputs = pd.DataFrame()
-
-if analysis_type == "Simple text analysis - PDFs with selectable text":
-for text_container in page_layout:
-
-text_container_analyzer_results = []
-text_container_analyzed_bounding_boxes = []
-
-characters = get_text_container_characters(text_container)
-
-# Create dataframe for all the text on the page
-line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
-
-print("line_characters:", line_characters)
-
-# Create page_text_outputs (OCR format outputs)
-if line_level_text_results_list:
-# Convert to DataFrame and add to ongoing logging table
-line_level_text_results_df = pd.DataFrame([{
-'page': page_no + 1,
-'text': result.text,
-'left': result.left,
-'top': result.top,
-'width': result.width,
-'height': result.height
-} for result in line_level_text_results_list])
-
-page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
-
-# Analyse each line of text in turn for PII and add to list
-for i, text_line in enumerate(line_level_text_results_list):
-text_line_analyzer_result = []
-text_line_bounding_boxes = []
-
-#print("text_line:", text_line.text)
-
-text_line_analyzer_result = analyze_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
-
-# Merge bounding boxes for the line if multiple found close together
-if text_line_analyzer_result:
-# Merge bounding boxes if very close together
-print("text_line_bounding_boxes:", text_line_bounding_boxes)
-print("line_characters:")
-#print(line_characters[i])
-print("".join(char._text for char in line_characters[i]))
-text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyzer_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
-
-text_container_analyzer_results.extend(text_line_analyzer_result)
-text_container_analyzed_bounding_boxes.extend(text_line_bounding_boxes)

-print("\n FINAL text_container_analyzer_results:", text_container_analyzer_results)

-
-page_analyzer_results.extend(text_container_analyzer_results)
-page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)

-

-# Annotate redactions on page
-annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
-
-# Make pymupdf redactions
-doc = redact_page_with_pymupdf(doc, annotations_on_page, page_no)
-
-# Make page annotations
-#page.Annots = pdf.make_indirect(annotations_on_page)
-if annotations_on_page:
 annotations_all_pages.extend([annotations_on_page])

-print("For page number:", page_no, "there are", len(annotations_all_pages[page_num]), "annotations")

-# Write logs
-# Create decision process table
-decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)

-if not decision_process_table_on_page.empty:
-decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])

-if not page_text_outputs.empty:
-page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
-#page_text_outputs.to_csv("text_page_text_outputs.csv")
-page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])

-return doc, decision_process_table_all_pages, page_text_outputs_all_pages
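Note: the nlp_analyser.analyze call removed above is Presidio's standard analyzer API. For reference, a minimal standalone sketch of the same call pattern (using presidio_analyzer directly rather than the app's preconfigured nlp_analyser):

    from presidio_analyzer import AnalyzerEngine

    analyzer = AnalyzerEngine()  # loads the default spaCy NLP engine
    results = analyzer.analyze(
        text="My name is David and I live in London",
        language="en",
    )
    for result in results:
        # Each RecognizerResult carries entity_type, start/end offsets and a score
        print(result.entity_type, result.start, result.end, result.score)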
 
4
  import io
5
  import os
6
  from PIL import Image, ImageChops, ImageDraw
7
+ from typing import List, Dict, Tuple
8
  import pandas as pd
9
 
10
  #from presidio_image_redactor.entities import ImageRecognizerResult
 
12
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
13
  from pikepdf import Pdf, Dictionary, Name
14
  import pymupdf
15
+ from pymupdf import Rect
16
+ from fitz import Document, Page
17
 
18
  import gradio as gr
19
  from gradio import Progress

20
  from collections import defaultdict # For efficient grouping
21
 
22
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
 
48
 
49
  return sum_of_numbers
50
 
51
+ def choose_and_run_redactor(file_paths:List[str], prepared_pdf_file_paths:List[str], prepared_pdf_image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, page_min:int=0, page_max:int=999, estimated_time_taken_state:float=0.0, handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], all_request_metadata_str:str = "", all_image_annotations:dict={}, pdf_text=[], progress=gr.Progress(track_tqdm=True)):
52
  '''
53
  Based on the type of redaction selected, pass the document file content onto the relevant function and return a redacted document plus processing logs.
54
  '''
 
61
  latest_file_completed = 0
62
  #out_message = []
63
  out_file_paths = []
64
+ pdf_text = []
65
 
66
  # If out message is string or out_file_paths are blank, change to a list so it can be appended to
67
  if isinstance(out_message, str):
 
72
 
73
  latest_file_completed = int(latest_file_completed)
74
 
75
+ #pdf_text = []
76
+
77
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
78
  if latest_file_completed >= len(file_paths):
79
+ #print("Last file reached")
80
  # Set to a very high number so as not to mix up with subsequent file processing by the user
81
  latest_file_completed = 99
82
  final_out_message = '\n'.join(out_message)
 
85
  estimate_total_processing_time = sum_numbers_before_seconds(final_out_message)
86
  print("Estimated total processing time:", str(estimate_total_processing_time))
87
 
88
+ #print("Final all_image_annotations:", all_image_annotations)
89
+
90
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimate_total_processing_time, all_request_metadata_str, pdf_text, all_image_annotations
91
 
92
  file_paths_loop = [file_paths[int(latest_file_completed)]]
93
 
 
113
  else:
114
  out_message = "No file selected"
115
  print(out_message)
116
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
117
 
118
  if in_redact_method == "Quick image analysis - typed text" or in_redact_method == "Complex image analysis - docs with handwriting/signatures (AWS Textract)":
119
  #Analyse and redact image-based pdf or image
120
  if is_pdf_or_image(file_path) == False:
121
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
122
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
123
 
124
  print("Redacting file " + file_path_without_ext + " as an image-based file")
125
 
126
+ pdf_text, redaction_logs, logging_file_paths, new_request_metadata, all_image_annotations = redact_image_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, is_a_pdf, page_min, page_max, in_redact_method, handwrite_signature_checkbox)
127
 
128
  # Save file
129
  if is_pdf(file_path) == False:
130
  out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_img.pdf"
131
+ pdf_text[0].save(out_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_text[1:])
132
 
133
  else:
134
  out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
135
+ pdf_text.save(out_image_file_path)
136
 
137
  out_file_paths.append(out_image_file_path)
138
  if logging_file_paths:
 
140
 
141
  out_message.append("File '" + file_path_without_ext + "' successfully redacted")
142
 
 
 
 
 
 
 
143
 
144
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
145
  redaction_logs.to_csv(logs_output_file_name)
 
157
 
158
  elif in_redact_method == "Simple text analysis - PDFs with selectable text":
159
 
160
+ print("file_path for selectable text analysis:", file_path)
161
 
162
  if is_pdf(file_path) == False:
163
+ out_message = "Please upload a PDF file for text analysis. If you have an image, select one of the image analysis options."
164
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
165
 
166
  # Analyse text-based pdf
167
  print('Redacting file as text-based PDF')
168
+ pdf_text, decision_process_logs, page_text_outputs, all_image_annotations = redact_text_pdf(file_path, prepared_pdf_image_paths, language, chosen_redact_entities, in_allow_list_flat, page_min, page_max, "Simple text analysis - PDFs with selectable text")
169
 
170
  out_text_file_path = output_folder + file_path_without_ext + "_text_redacted.pdf"
171
  pdf_text.save(out_text_file_path)
 
198
  else:
199
  out_message = "No redaction method selected"
200
  print(out_message)
201
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
202
 
203
  toc = time.perf_counter()
204
  out_time = f"in {toc - tic:0.1f} seconds."
 
221
  log_files_output_paths.append(all_request_metadata_file_path)
222
 
223
 
224
+ return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pdf_text, all_image_annotations
225
+
226
+ def convert_pikepdf_coords_to_pymupdf(pymupdf_page, annot):
227
+ '''
228
+ Convert annotations from pikepdf to pymupdf format
229
+ '''
230
+
231
+ mediabox_height = pymupdf_page.mediabox[3] - pymupdf_page.mediabox[1]
232
+ mediabox_width = pymupdf_page.mediabox[2] - pymupdf_page.mediabox[0]
233
+ rect_height = pymupdf_page.rect.height
234
+ rect_width = pymupdf_page.rect.width
235
+
236
+ # Calculate scaling factors
237
+ #scale_height = rect_height / mediabox_height if mediabox_height else 1
238
+ #scale_width = rect_width / mediabox_width if mediabox_width else 1
239
+
240
+ # Adjust coordinates based on scaling factors
241
+ page_x_adjust = (rect_width - mediabox_width) / 2 # Center adjustment
242
+ page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
243
+
244
+ #print("In the pikepdf conversion function")
245
+ # Extract the /Rect field
246
+ rect_field = annot["/Rect"]
247
+
248
+ # Convert the extracted /Rect field to a list of floats (since pikepdf uses Decimal objects)
249
+ rect_coordinates = [float(coord) for coord in rect_field]
250
+
251
+ # Convert the Y-coordinates (flip using the page height)
252
+ x1, y1, x2, y2 = rect_coordinates
253
+ x1 = x1 + page_x_adjust
254
+ new_y1 = (rect_height - y2) - page_y_adjust
255
+ x2 = x2 + page_x_adjust
256
+ new_y2 = (rect_height - y1) - page_y_adjust
257
+
258
+ return x1, new_y1, x2, new_y2
259
+
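A quick sanity check of the Y-flip above (a sketch; a plain dict stands in for the pikepdf annotation, since only its /Rect key is read):

import pymupdf

doc = pymupdf.open()                         # empty in-memory PDF
page = doc.new_page(width=100, height=100)
annot = {"/Rect": [10, 20, 30, 40]}          # pikepdf-style rect, origin at bottom-left

print(convert_pikepdf_coords_to_pymupdf(page, annot))
# expected: (10.0, 60.0, 30.0, 80.0) - Y values flipped against the page height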
260
+ def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
261
+ '''
262
+ Convert annotations from pikepdf coordinates to image coordinates.
263
+ '''
264
+
265
+ # Get the dimensions of the page in points with pymupdf
266
+ rect_height = pymupdf_page.rect.height
267
+ rect_width = pymupdf_page.rect.width
268
+
269
+ # Get the dimensions of the image
270
+ image_page_width, image_page_height = image.size
271
+
272
+ # Calculate scaling factors between pymupdf and PIL image
273
+ scale_width = image_page_width / rect_width
274
+ scale_height = image_page_height / rect_height
275
+
276
+ # Extract the /Rect field
277
+ rect_field = annot["/Rect"]
278
+
279
+ # Convert the extracted /Rect field to a list of floats
280
+ rect_coordinates = [float(coord) for coord in rect_field]
281
+
282
+ # Convert the Y-coordinates (flip using the image height)
283
+ x1, y1, x2, y2 = rect_coordinates
284
+ x1_image = x1 * scale_width
285
+ new_y1_image = image_page_height - (y2 * scale_height) # Flip Y0 (since it starts from bottom)
286
+ x2_image = x2 * scale_width
287
+ new_y2_image = image_page_height - (y1 * scale_height) # Flip Y1
288
+
289
+ return x1_image, new_y1_image, x2_image, new_y2_image
290
+
291
+ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerResult, image:Image):
292
+ '''
293
+ Converts redaction coordinates from a CustomImageRecognizerResult (image pixel space) to pymupdf page coordinates.
294
+ '''
295
+
296
+ rect_height = pymupdf_page.rect.height
297
+ rect_width = pymupdf_page.rect.width
298
+
299
+ image_page_width, image_page_height = image.size
300
+
301
+ # Calculate scaling factors between PIL image and pymupdf
302
+ scale_width = rect_width / image_page_width
303
+ scale_height = rect_height / image_page_height
304
 
305
+ # Calculate scaled coordinates
306
+ x1 = (annot.left * scale_width)# + page_x_adjust
307
+ new_y1 = (annot.top * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
308
+ x2 = ((annot.left + annot.width) * scale_width)# + page_x_adjust # Calculate x1
309
+ new_y2 = ((annot.top + annot.height) * scale_height)# - page_y_adjust # Calculate y1 correctly
310
+
311
+ return x1, new_y1, x2, new_y2
312
+
313
+ def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
314
+ '''
315
+ Converts redaction coordinates from a gradio annotation component (image pixel space) to pymupdf page coordinates.
316
+ '''
317
+
318
+ rect_height = pymupdf_page.rect.height
319
+ rect_width = pymupdf_page.rect.width
320
+
321
+ image_page_width, image_page_height = image.size
322
+
323
+ # Calculate scaling factors between PIL image and pymupdf
324
+ scale_width = rect_width / image_page_width
325
+ scale_height = rect_height / image_page_height
326
+
327
+ # Calculate scaled coordinates
328
+ x1 = (annot["xmin"] * scale_width)# + page_x_adjust
329
+ new_y1 = (annot["ymin"] * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
330
+ x2 = ((annot["xmax"]) * scale_width)# + page_x_adjust # Calculate x1
331
+ new_y2 = ((annot["ymax"]) * scale_height)# - page_y_adjust # Calculate y1 correctly
332
+
333
+ return x1, new_y1, x2, new_y2
334
+
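Note there is no Y-flip in this one: both the Gradio annotator and pymupdf measure from the top-left, so only scaling applies. A sketch with the image rendered at twice the page size:

import pymupdf
from PIL import Image

doc = pymupdf.open()
page = doc.new_page(width=100, height=100)
image = Image.new("RGB", (200, 200))         # annotator image at 2x page scale
box = {"xmin": 20, "ymin": 40, "xmax": 60, "ymax": 80}

print(convert_gradio_annotation_coords_to_pymupdf(page, box, image))
# expected: (10.0, 20.0, 30.0, 40.0) - pixel coordinates halved back into page points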
335
+ def move_page_info(file_path: str) -> str:
336
+ # Split the string at '.pdf'
337
+ base, extension = file_path.rsplit('.pdf', 1)
338
+
339
+ # Extract the page info
340
+ page_info = base.split('page ')[1].split(' of')[0] # Get the page number
341
+ new_base = base.replace(f'page {page_info} of ', '') # Remove the page info from the original position
342
+
343
+ # Construct the new file path
344
+ new_file_path = f"{new_base}_page_{page_info}.png"
345
+
346
+ return new_file_path
347
+
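For example, with a hypothetical name in the 'page N of <file>.pdf' form that pymupdf's str(page) produces:

print(move_page_info("page 3 of document.pdf"))
# -> 'document_page_3.png'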
348
+ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
+ '''
+ Apply redaction boxes to a pymupdf page and return the redacted page plus the boxes in image-annotator format.
+ '''
349
 
 
350
  mediabox_height = page.mediabox[3] - page.mediabox[1]
351
  mediabox_width = page.mediabox[2] - page.mediabox[0]
352
  rect_height = page.rect.height
 
355
  #print("page_rect_height:", page.rect.height)
356
  #print("page mediabox size:", page.mediabox[3] - page.mediabox[1])
357
 
358
+ out_annotation_boxes = {}
359
+ all_image_annotation_boxes = []
360
+ image_path = ""
361
+
362
+ if isinstance(image, Image.Image):
363
+ image_path = move_page_info(str(page))
364
+ image.save(image_path)
365
+ elif isinstance(image, str):
366
+ image_path = image
367
+ image = Image.open(image_path)
368
+
369
+ #print("annotations_on_page:", annotations_on_page)
370
+
371
+ # Check if this is an object used in the Gradio Annotation component
372
+ if isinstance(annotations_on_page, dict):
373
+ annotations_on_page = annotations_on_page["boxes"]
374
+ #print("annotations on page:", annotations_on_page)
375
+
376
  for annot in annotations_on_page:
377
+ #print("annot:", annot)
378
+
379
+ # Check if an Image recogniser result, or a Gradio annotation object
380
+ if isinstance(annot, CustomImageRecognizerResult) or isinstance(annot, dict):
381
 
382
+ img_annotation_box = {}
 
 
383
 
384
+ # Should already be in correct format if img_annotator_box is an input
385
+ if isinstance(annot, dict):
386
+ img_annotation_box = annot
387
+ # A dict from the annotator already carries its own label; only set a default if it is missing
+ if "label" not in img_annotation_box:
+ img_annotation_box["label"] = "Redaction"
391
 
392
+ x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
393
 
394
+ # Else should be CustomImageRecognizerResult
395
+ else:
396
+ x1, pymupdf_y1, x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
 
 
397
 
398
+ img_annotation_box["xmin"] = annot.left
399
+ img_annotation_box["ymin"] = annot.top
400
+ img_annotation_box["xmax"] = annot.left + annot.width
401
+ img_annotation_box["ymax"] = annot.top + annot.height
402
+ img_annotation_box["color"] = (0,0,0)
403
+ try:
404
+ img_annotation_box["label"] = annot.entity_type
405
+ except:
406
+ img_annotation_box["label"] = "Redaction"
407
 
408
+ rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect

409
 
410
+ # Else it should be a pikepdf annotation object
411
+ else:
412
+ x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
413
 
414
+ rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
 
 
415
 
416
+ img_annotation_box = {}
 
417
 
418
+ if image:
419
+ image_x1, image_y1, image_x2, image_y2 = convert_pikepdf_to_image_coords(page, annot, image)

420
 
421
+
422
+ img_annotation_box["xmin"] = image_x1
423
+ img_annotation_box["ymin"] = image_y1
424
+ img_annotation_box["xmax"] = image_x2
425
+ img_annotation_box["ymax"] = image_y2
426
+ img_annotation_box["color"] = (0,0,0)
427
+
428
+ if isinstance(annot, Dictionary):
429
+ #print("Trying to get label out of annotation", annot["/T"])
430
+ img_annotation_box["label"] = str(annot["/T"])
431
+ #print("Label is:", img_annotation_box["label"])
432
+ else:
433
+ img_annotation_box["label"] = "REDACTION"
434
 
435
  # Convert to a PyMuPDF Rect object
436
  #rect = Rect(rect_coordinates)
437
 
438
+ all_image_annotation_boxes.append(img_annotation_box)
439
+
440
+ # Calculate the middle y value and use a thin band (4 points) around it
441
+ middle_y = (pymupdf_y1 + pymupdf_y2) / 2
442
+ rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
 
 
443
 
444
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
445
  page.add_redact_annot(rect_single_pixel_height)
 
450
  shape.finish(color=(0, 0, 0), fill=(0, 0, 0)) # Black fill for the rectangle
451
  shape.commit()
452
 
453
+ out_annotation_boxes = {
454
+ "image": image_path, #Image.open(image_path), #image_path,
455
+ "boxes": all_image_annotation_boxes
456
+ }
457
+
458
  page.apply_redactions(images=0, graphics=0)
459
  page.clean_contents()
460
 
461
+ #print("Everything is fine at end of redact_page_with_pymupdf")
462
+ #print("\nout_annotation_boxes:", out_annotation_boxes)
463
+
464
+ return page, out_annotation_boxes
465
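A minimal end-to-end sketch on a blank one-page document. The box dict mimics the Gradio annotator format, and the page image is a stand-in saved to disk first, since the function accepts either a PIL image or a file path:

import pymupdf
from PIL import Image

doc = pymupdf.open()
page = doc.new_page(width=200, height=200)
Image.new("RGB", (200, 200), "white").save("blank_page.png")   # stand-in rendered page

boxes = [{"xmin": 20, "ymin": 30, "xmax": 120, "ymax": 50, "label": "Redaction", "color": (0, 0, 0)}]
page, out_boxes = redact_page_with_pymupdf(page, boxes, "blank_page.png")
print(out_boxes["boxes"])        # the boxes echoed back in annotator format
doc.save("blank_redacted.pdf")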
 
466
  def bounding_boxes_overlap(box1, box2):
467
  """Check if two bounding boxes overlap."""
 
485
  # Reconstruct bounding boxes for substrings of interest
486
  reconstructed_bboxes = []
487
  for bbox in bboxes:
488
+ print("bbox:", bbox)
489
  bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
490
  for line_text, line_info in combined_results.items():
491
  line_box = line_info['bounding_box']
 
507
  current_char += 1 # +1 for space if the word doesn't already end with a space
508
 
509
  if relevant_words:
510
+ #print("Relevant words:", relevant_words)
511
  left = min(word['bounding_box'][0] for word in relevant_words)
512
  top = min(word['bounding_box'][1] for word in relevant_words)
513
  right = max(word['bounding_box'][2] for word in relevant_words)
 
515
 
516
  # Combine the text of all relevant words
517
  combined_text = " ".join(word['text'] for word in relevant_words)
518
+
519
+ # Calculate new dimensions for the merged box
520
+
521
+
522
+
523
 
524
  reconstructed_bbox = CustomImageRecognizerResult(
525
  bbox.entity_type,
 
555
  else:
556
  new_text = merged_box.text + " " + next_box.text
557
 
558
+ if merged_box.text == next_box.text:
559
+ new_text = merged_box.text
560
+ new_entity_type = merged_box.entity_type # Keep the original entity type
561
+ else:
562
+ new_text = merged_box.text + " " + next_box.text
563
+ new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
564
+
565
  new_left = min(merged_box.left, next_box.left)
566
  new_top = min(merged_box.top, next_box.top)
567
  new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
568
  new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
569
  merged_box = CustomImageRecognizerResult(
570
+ new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
571
  )
572
  else:
573
  merged_bboxes.append(merged_box)
 
577
 
578
  return merged_bboxes
579
 
580
+ def redact_image_pdf(file_path:str, prepared_pdf_file_paths:List[str], language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, is_a_pdf:bool=True, page_min:int=0, page_max:int=999, analysis_type:str="Quick image analysis - typed text", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"], request_metadata:str="", progress=Progress(track_tqdm=True)):
581
  '''
582
  Take a path for an image of a document, then run this image through the Presidio ImageAnalyzer and PIL to get a redacted page back. Adapted from Presidio ImageRedactorEngine.
583
  '''
 
587
  fill = (0, 0, 0) # Fill colour
588
  decision_process_output_str = ""
589
  images = []
590
+ all_image_annotations = []
591
  #request_metadata = {}
592
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
593
 
594
  # Also open as pymupdf pdf to apply annotations later on
595
+ pymupdf_doc = pymupdf.open(file_path)
596
 
597
+ if not prepared_pdf_file_paths:
598
  out_message = "PDF does not exist as images. Converting pages to image"
599
  print(out_message)
600
 
601
+ prepared_pdf_file_paths = process_file(file_path)
602
 
603
+ if not isinstance(prepared_pdf_file_paths, list):
604
+ print("Converting prepared_pdf_file_paths to list")
605
+ prepared_pdf_file_paths = [prepared_pdf_file_paths]
606
 
607
+ #print("Image paths:", prepared_pdf_file_paths)
608
+ number_of_pages = len(prepared_pdf_file_paths)
609
 
610
  print("Number of pages:", str(number_of_pages))
611
 
 
634
  if analysis_type == "Quick image analysis - typed text": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
635
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
636
 
637
+ for i in range(0, number_of_pages):
638
  handwriting_or_signature_boxes = []
639
  signature_recogniser_results = []
640
  handwriting_recogniser_results = []
641
 
642
+
643
+ # Assuming prepared_pdf_file_paths[i] is your PIL image object
644
  try:
645
+ image = prepared_pdf_file_paths[i]#.copy()
646
+ print("image:", image)
 
647
  except Exception as e:
648
+ print("Could not redact page:", str(i + 1), "due to:")
649
  print(e)
650
  continue
651
 
652
+ image_annotations = {"image": image, "boxes": []}
653
 
654
+ #try:
655
+ print("prepared_pdf_file_paths:", prepared_pdf_file_paths)
656
+
657
+ if i >= page_min and i < page_max:
658
 
659
  reported_page_number = str(i + 1)
660
 
661
  print("Redacting page", reported_page_number)
662
 
663
+ pymupdf_page = pymupdf_doc.load_page(i)
664
 
665
  # Need image size to convert textract OCR outputs to the correct sizes
666
  page_width, page_height = image.size
667

668
  # Possibility to use different languages
669
  if language == 'en':
670
  ocr_lang = 'eng'
 
709
 
710
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height)
711

712
  # Step 2: Analyze text and identify PII
713
+ if chosen_redact_entities:
714
+
715
+ redaction_bboxes = image_analyser.analyze_text(
716
+ line_level_ocr_results,
717
+ line_level_ocr_results_with_children,
718
+ language=language,
719
+ entities=chosen_redact_entities,
720
+ allow_list=allow_list,
721
+ score_threshold=score_threshold,
722
+ )
723
+ else:
724
+ redaction_bboxes = []
725
 
726
  if analysis_type == "Quick image analysis - typed text": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
727
  elif analysis_type == "Complex image analysis - docs with handwriting/signatures (AWS Textract)": interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 
734
  # Merge close bounding boxes
735
  merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
736
 
737
+ # Save image first so that the redactions can be checked after
738
+ #image.save(output_folder + "page_as_img_" + file_name + "_pages_" + str(reported_page_number) + ".png")
739
 
740
  # 3. Draw the merged boxes
741
+ #if merged_redaction_bboxes:
742
  if is_pdf(file_path) == False:
743
  draw = ImageDraw.Draw(image)
744
 
745
+ all_image_annotations_boxes = []
746
+
747
  for box in merged_redaction_bboxes:
748
+ print("box:", box)
749
+
750
  x0 = box.left
751
  y0 = box.top
752
  x1 = x0 + box.width
753
  y1 = y0 + box.height
 
754
 
755
+ try:
756
+ label = box.entity_type
757
+ except:
758
+ label = "Redaction"
759
+
760
+ # Directly append the dictionary with the required keys
761
+ all_image_annotations_boxes.append({
762
+ "xmin": x0,
763
+ "ymin": y0,
764
+ "xmax": x1,
765
+ "ymax": y1,
766
+ "label": label,
767
+ "color": (0, 0, 0)
768
+ })
769
+
770
+ draw.rectangle([x0, y0, x1, y1], fill=fill) # Adjusted to use a list for rectangle
771
+
772
+ image_annotations = {"image": file_path, "boxes": all_image_annotations_boxes}
773
 
774
  ## Apply annotations with pymupdf
775
  else:
776
+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image)#, scale)
 
 
777
 
778
+ # Convert decision process to table
779
+ decision_process_table = pd.DataFrame([{
780
+ 'page': reported_page_number,
781
+ 'entity_type': result.entity_type,
782
+ 'start': result.start,
783
+ 'end': result.end,
784
+ 'score': result.score,
785
+ 'left': result.left,
786
+ 'top': result.top,
787
+ 'width': result.width,
788
+ 'height': result.height,
789
+ 'text': result.text
790
+ } for result in merged_redaction_bboxes])
791
 
792
+ all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])
 
793
 
794
  # Convert to DataFrame and add to ongoing logging table
795
  line_level_ocr_results_df = pd.DataFrame([{
 
803
 
804
  all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, line_level_ocr_results_df])
805

806
  if is_pdf(file_path) == False:
807
  images.append(image)
808
+ pymupdf_doc = images
809
 
810
+ all_image_annotations.append(image_annotations)

811
 
812
  all_line_level_ocr_results_df.to_csv(ocr_results_file_path)
813
  logging_file_paths.append(ocr_results_file_path)
814
 
815
+ return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, all_image_annotations
816
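A hedged sketch of calling this directly; the file names and pre-rendered page image are assumptions, as the app normally prepares these inputs via prepare_image_or_pdf:

from PIL import Image

pages = [Image.open("output/example_page_1.png")]   # hypothetical pre-rendered pages
pymupdf_doc, decisions, log_paths, metadata, annotations = redact_image_pdf(
    "example.pdf", pages, "en", ["PERSON", "EMAIL_ADDRESS"],
    page_min=0, page_max=1)
decisions.to_csv("output/example_decision_process.csv")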
+
817
+
818
+ ###
819
+ # PIKEPDF TEXT PDF REDACTION
820
+ ###
821
 
822
  def get_text_container_characters(text_container:LTTextContainer):
823
 
 
830
  return characters
831
  return []
832
 
833
+ def analyse_text_container(text_container:OCRResult, language:str, chosen_redact_entities:List[str], score_threshold:float, allow_list:List[str]):
 
834
  '''
835
  Take text and bounding boxes in OCRResult format and analyze it for PII using spacy and the Microsoft Presidio package.
836
  '''
837
 
838
+ analyser_results = []
839
+
840
  text_to_analyze = text_container.text
841
  #print("text_to_analyze:", text_to_analyze)
842
 
843
+ if chosen_redact_entities:
844
+ analyser_results = nlp_analyser.analyze(text=text_to_analyze,
845
+ language=language,
846
+ entities=chosen_redact_entities,
847
+ score_threshold=score_threshold,
848
+ return_decision_process=True,
849
+ allow_list=allow_list)
850
 
851
+ print(analyser_results)
852
+
853
+ return analyser_results
854
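For reference, the underlying Presidio call looks like this on a bare string (a standalone sketch; the app's nlp_analyser instance is configured elsewhere in tools, and a spaCy model must be installed):

from presidio_analyzer import AnalyzerEngine

analyser = AnalyzerEngine()
results = analyser.analyze(text="Call John on 07700 900123",
                           language="en",
                           entities=["PERSON", "PHONE_NUMBER"],
                           score_threshold=0.3,
                           return_decision_process=True)
for result in results:
    print(result.entity_type, result.start, result.end, result.score)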
 
855
  def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tuple[List[OCRResult], List[LTChar]]:
856
  '''
 
930
 
931
  return line_level_results_out, line_level_characters_out # Return both results and character objects
932
 
933
+ def merge_text_bounding_boxes(analyser_results:List, characters:List[LTChar], combine_pixel_dist:int, vertical_padding:int=0):
934
  '''
935
  Merge identified bounding boxes containing PII that are very close to one another
936
  '''
937
+ analysed_bounding_boxes = []
938
+ if len(analyser_results) > 0 and len(characters) > 0:
939
  # Extract bounding box coordinates for sorting
940
  bounding_boxes = []
941
  text_out = []
942
+ for result in analyser_results:
943
  char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
944
  char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
945
  if char_boxes:
 
985
  current_box[2] = char_box[2] # Extend the current box horizontally
986
  current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
987
  current_result.end = max(current_result.end, result.end) # Extend the text range
988
+ try:
989
+ current_result.entity_type = current_result.entity_type + " - " + result.entity_type
990
+ except:
991
+ print("Unable to append new result type.")
992
  # Add a space if current_text is not empty
993
  if current_text:
994
  current_text.append(" ") # Add space between texts
995
  current_text.extend(text)
996
+
997
+ #print(f"Latest merged box: {current_box[-1]}")
998
  else:
999
  merged_bounding_boxes.append(
1000
  {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
1001
  #print(f"Appending merged box: {current_box}")
1002
+ #print(f"Latest merged box: {merged_bounding_boxes[-1]}")
1003
 
1004
  # Reset current_box and current_y after appending
1005
  current_box = char_box
 
1014
  #print(f"Appending final box for result: {current_box}")
1015
 
1016
  if not merged_bounding_boxes:
1017
+ analysed_bounding_boxes.extend(
1018
  {"text":text, "boundingBox": char.bbox, "result": result}
1019
+ for result in analyser_results
1020
  for char in characters[result.start:result.end]
1021
  if isinstance(char, LTChar)
1022
  )
1023
  else:
1024
+ analysed_bounding_boxes.extend(merged_bounding_boxes)
1025
 
1026
+ #print("Analyzed bounding boxes:\n\n", analysed_bounding_boxes)
1027
 
1028
+ return analysed_bounding_boxes
1029
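Each merged entry comes back shaped like the following (a hand-written illustration of the output format, not produced by running the function; coordinates are PDF points taken from the LTChar bounding boxes):

merged_example = {
    "text": "John Smith",
    "boundingBox": [72.0, 701.5, 148.3, 713.2],   # x0, y0, x1, y1
    "result": "a Presidio RecognizerResult covering the merged span",
}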
 
1030
+ def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1031
  decision_process_table = pd.DataFrame()
1032
 
1033
+ if len(analyser_results) > 0:
1034
  # Create summary df of annotations to be made
1035
+ analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
1036
+ analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
1037
+ analysed_bounding_boxes_df_text.columns = ["type", "start", "end", "score"]
1038
+ analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
1039
+ analysed_bounding_boxes_df_new['page'] = page_num + 1
1040
+ decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
1041
 
1042
  #print('\n\ndecision_process_table:\n\n', decision_process_table)
1043
 
1044
  return decision_process_table
1045
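A small check of the string-parsing step, which relies on Presidio's RecognizerResult str() format 'type: ..., start: ..., end: ..., score: ...':

from presidio_analyzer import RecognizerResult

result = RecognizerResult(entity_type="PERSON", start=0, end=4, score=0.85)
boxes = [{"text": "John", "boundingBox": [10, 700, 40, 712], "result": result}]

print(create_text_redaction_process_results([result], boxes, page_num=0))
# columns: text, boundingBox, type, start, end, score, page (page = 1)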
 
1046
+ def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
1047
  annotations_on_page = []
1048
+ for analysed_bounding_box in analysed_bounding_boxes:
1049
+ bounding_box = analysed_bounding_box["boundingBox"]
1050
  annotation = Dictionary(
1051
  Type=Name.Annot,
1052
  Subtype=Name.Square, #Name.Highlight,
 
1056
  C=[0, 0, 0],
1057
  IC=[0, 0, 0],
1058
  CA=1, # Transparency
1059
+ T=analysed_bounding_box["result"].entity_type,
1060
  BS=Dictionary(
1061
  W=0, # Border width: 1 point
1062
  S=Name.S # Border style: solid
 
1065
  annotations_on_page.append(annotation)
1066
  return annotations_on_page
1067
 
1068
+ def redact_text_pdf(filename:str, prepared_pdf_image_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, page_min:int=0, page_max:int=999, analysis_type:str = "Simple text analysis - PDFs with selectable text", progress=Progress(track_tqdm=True)):
1069
  '''
1070
  Redact chosen entities from a pdf that is made up of multiple pages that are not images.
1071
  '''
1072
  annotations_all_pages = []
1073
+ all_image_annotations = []
1074
  page_text_outputs_all_pages = pd.DataFrame()
1075
  decision_process_table_all_pages = pd.DataFrame()
1076
 
1077
  combine_pixel_dist = 20 # Horizontal distance between PII bounding boxes under/equal they are combined into one
1078
 
1079
  # Open with Pikepdf to get text lines
1080
+ pikepdf_pdf = Pdf.open(filename)
1081
+ number_of_pages = len(pikepdf_pdf.pages)
 
 
1082
 
1083
+ # Also open pdf with pymupdf to be able to annotate later while retaining text
1084
+ pymupdf_doc = pymupdf.open(filename)
1085
+
1086
+ page_num = 0
1087
 
1088
  # Check that page_min and page_max are within expected ranges
1089
  if page_max > number_of_pages or page_max == 0:
 
1091
  #else:
1092
  # page_max = page_max - 1
1093
 
1094
+ if page_min <= 0: page_min = 0
1095
+ else: page_min = page_min - 1
 
 
1096
 
1097
+ print("Page range is", str(page_min + 1), "to", str(page_max))
1098
 
1099
+ for page_no in range(0, number_of_pages): #range(page_min, page_max):
1100
+ #print("prepared_pdf_image_path:", prepared_pdf_image_path)
1101
+ #print("prepared_pdf_image_path[page_no]:", prepared_pdf_image_path[page_no])
1102
+ image = prepared_pdf_image_path[page_no]
1103
 
1104
+ image_annotations = {"image": image, "boxes": []}
1105
+
1106
+ pymupdf_page = pymupdf_doc.load_page(page_no)
1107
+
1108
+ print("Page number is:", str(page_no + 1))
1109
+
1110
+ if page_min <= page_no < page_max:
1111
+
1112
+ for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
1113
+
1114
+ page_analyser_results = []
1115
+ page_analysed_bounding_boxes = []
1116
+
1117
+ characters = []
1118
+ annotations_on_page = []
1119
+ decision_process_table_on_page = pd.DataFrame()
1120
+ page_text_outputs = pd.DataFrame()
1121
+
1122
+ if analysis_type == "Simple text analysis - PDFs with selectable text":
1123
+ for text_container in page_layout:
1124
+
1125
+ text_container_analyser_results = []
1126
+ text_container_analysed_bounding_boxes = []
1127
+
1128
+ characters = get_text_container_characters(text_container)
1129
+
1130
+ # Create dataframe for all the text on the page
1131
+ line_level_text_results_list, line_characters = create_text_bounding_boxes_from_characters(characters)
1132
+
1133
+ #print("line_characters:", line_characters)
1134
+
1135
+ # Create page_text_outputs (OCR format outputs)
1136
+ if line_level_text_results_list:
1137
+ # Convert to DataFrame and add to ongoing logging table
1138
+ line_level_text_results_df = pd.DataFrame([{
1139
+ 'page': page_no + 1,
1140
+ 'text': result.text,
1141
+ 'left': result.left,
1142
+ 'top': result.top,
1143
+ 'width': result.width,
1144
+ 'height': result.height
1145
+ } for result in line_level_text_results_list])
1146
+
1147
+ page_text_outputs = pd.concat([page_text_outputs, line_level_text_results_df])
1148
+
1149
+ # Analyse each line of text in turn for PII and add to list
1150
+ for i, text_line in enumerate(line_level_text_results_list):
1151
+ text_line_analyzer_result = []
1152
+ text_line_bounding_boxes = []
1153
+
1154
+ #print("text_line:", text_line.text)
1155
+
1156
+ text_line_analyzer_result = analyse_text_container(text_line, language, chosen_redact_entities, score_threshold, allow_list)
1157
+
1158
+ # Merge bounding boxes for the line if multiple found close together
1159
+ if text_line_analyzer_result:
1160
+ # Merge bounding boxes if very close together
1161
+ #print("text_line_bounding_boxes:", text_line_bounding_boxes)
1162
+ #print("line_characters:")
1163
+ #print(line_characters[i])
1164
+ #print("".join(char._text for char in line_characters[i]))
1165
+ text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyzer_result, line_characters[i], combine_pixel_dist, vertical_padding = 0)
1166
+
1167
+ text_container_analyser_results.extend(text_line_analyzer_result)
1168
+ text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1169
+
1170
+ #print("\n FINAL text_container_analyser_results:", text_container_analyser_results)
1171

1172
 
1173
+ page_analyser_results.extend(text_container_analyser_results)
1174
+ page_analysed_bounding_boxes.extend(text_container_analysed_bounding_boxes)
1175
 
1176
+ # Annotate redactions on page
1177
+ annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1178
+
1179
+
1180
+ # Make page annotations
1181
+ #page.Annots = pdf.make_indirect(annotations_on_page)
1182
+ #if annotations_on_page:
1183
 
1184
+ # Make pymupdf redactions
1185
+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)
1186
 
 
 
 
 
 
 
 
 
 
1187
  annotations_all_pages.extend([annotations_on_page])
1188
 
1189
+ print("For page number:", str(page_no + 1), "there are", len(annotations_on_page), "annotations")
1190
+
1191
+ # Write logs
1192
+ # Create decision process table
1193
+ decision_process_table_on_page = create_text_redaction_process_results(page_analyser_results, page_analysed_bounding_boxes, page_no)
1194
 
1195
+ if not decision_process_table_on_page.empty:
1196
+ decision_process_table_all_pages = pd.concat([decision_process_table_all_pages, decision_process_table_on_page])
 
1197
 
1198
+ if not page_text_outputs.empty:
1199
+ page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1200
+ #page_text_outputs.to_csv("text_page_text_outputs.csv")
1201
+ page_text_outputs_all_pages = pd.concat([page_text_outputs_all_pages, page_text_outputs])
1202
 
1203
+ all_image_annotations.append(image_annotations)

1204
 
1205
+ return pymupdf_doc, decision_process_table_all_pages, page_text_outputs_all_pages, all_image_annotations
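A matching hedged sketch for the selectable-text path (file names are assumptions; the rendered page images feed the review annotator):

from PIL import Image

page_images = [Image.open("output/example_page_1.png")]   # hypothetical rendered pages
pymupdf_doc, decisions, page_text, annotations = redact_text_pdf(
    "example.pdf", page_images, "en", ["PERSON", "PHONE_NUMBER"],
    page_min=0, page_max=1)
pymupdf_doc.save("output/example_text_redacted.pdf")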
tools/redaction_review.py ADDED
@@ -0,0 +1,211 @@
1
+ import gradio as gr
2
+ import numpy as np
3
+ from typing import List
4
+ from gradio_image_annotation import image_annotator
5
+ from gradio_image_annotation.image_annotator import AnnotatedImageData
6
+
7
+ from tools.file_conversion import is_pdf, convert_pdf_to_images
8
+ from tools.helper_functions import get_file_path_end, output_folder
9
+ from tools.file_redaction import redact_page_with_pymupdf
10
+ import json
11
+ import pymupdf
12
+ from fitz import Document
13
+ from PIL import ImageDraw, Image
14
+
15
+ def decrease_page(number:int):
16
+ '''
17
+ Decrease page number for review redactions page.
18
+ '''
19
+ #print("number:", str(number))
20
+ if number > 1:
21
+ return number - 1
22
+ else:
23
+ return 1
24
+
25
+ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
26
+ '''
27
+ Increase page number for review redactions page.
28
+ '''
29
+
30
+ if not image_annotator_object:
31
+ return 1
32
+
33
+ max_pages = len(image_annotator_object)
34
+
35
+ if number < max_pages:
36
+ return number + 1
37
+ else:
38
+ return max_pages
39
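A quick behaviour check of the bounds handling, with three dummy pages:

pages = [{"image": "p%d.png" % n, "boxes": []} for n in (1, 2, 3)]
print(decrease_page(1))           # 1 - never drops below the first page
print(increase_page(3, pages))    # 3 - capped at the page count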
+
40
+ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
41
+ #print("\nImage annotator object:", image_annotator_object[0])
42
+
43
+ if not image_annotator_object:
44
+ return image_annotator(
45
+ label="Modify redaction boxes",
46
+ #label_list=["Redaction"],
47
+ #label_colors=[(0, 0, 0)],
48
+ sources=["upload"],
49
+ show_clear_button=False,
50
+ show_remove_button=False,
51
+ interactive=False
52
+ ), gr.Number(label = "Current page", value=1, precision=0)
53
+
54
+ # Check bounding values for current page and page max
55
+ if page_num > 0:
56
+ page_num_reported = page_num
57
+ #page_num = page_num - 1
58
+ elif page_num == 0: page_num_reported = 1
59
+ else:
60
+ page_num = 0
61
+ page_num_reported = 1
62
+
63
+ page_max_reported = len(image_annotator_object)
64
+
65
+ if page_num_reported > page_max_reported:
66
+ page_num_reported = page_max_reported
67
+
68
+ out_image_annotator = image_annotator(value = image_annotator_object[page_num_reported - 1],
69
+ boxes_alpha=0.1,
70
+ box_thickness=1,
71
+ #label_list=["Redaction"],
72
+ #label_colors=[(0, 0, 0)],
73
+ height='60%',
74
+ width='60%',
75
+ box_min_size=1,
76
+ box_selected_thickness=2,
77
+ handle_size=4,
78
+ sources=None,#["upload"],
79
+ show_clear_button=False,
80
+ show_remove_button=False,
81
+ handles_cursor=True,
82
+ interactive=True
83
+ )
84
+
85
+ number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
86
+
87
+ return out_image_annotator, number_reported
88
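A minimal wiring sketch for these callbacks; the component names are assumptions, as the real event hookups live in app.py:

import gradio as gr
from gradio_image_annotation import image_annotator

with gr.Blocks() as review_app:
    all_annotations_state = gr.State([])   # one {"image": ..., "boxes": [...]} dict per page
    current_page = gr.Number(label="Current page", value=1, precision=0)
    annotator = image_annotator(label="Modify redaction boxes", interactive=False)
    next_btn = gr.Button("Next page")

    next_btn.click(increase_page, inputs=[current_page, all_annotations_state], outputs=[current_page]).\
        then(update_annotator, inputs=[all_annotations_state, current_page], outputs=[annotator, current_page])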
+
89
+ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData]):
90
+ '''
91
+ Overwrite current image annotations with modifications
92
+ '''
93
+ print("all_image_annotations before:",all_image_annotations)
94
+
95
+ image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
96
+
97
+ #print("image_annotated:", image_annotated)
98
+
99
+ all_image_annotations[previous_page - 1] = image_annotated
100
+
101
+ print("all_image_annotations after:",all_image_annotations)
102
+
103
+ return all_image_annotations, current_page
104
+
105
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int):
106
+ '''
107
+ Apply modified redactions to a pymupdf document or image file and save the results
108
+ '''
109
+
110
+ output_files = []
111
+
112
+ # Check for annotations first, before touching the state
+ if not image_annotated:
+ print("No image annotations found")
+ return doc, all_image_annotations, output_files
+
+ image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
+
+ all_image_annotations[current_page - 1] = image_annotated
119
+
120
+ file_path = file_paths[-1].name
121
+ print("file_path:", file_path)
122
+ file_base = get_file_path_end(file_path)
123
+
124
+ # If working with image docs
125
+ if is_pdf(file_path) == False:
126
+ unredacted_doc = Image.open(file_path)
127
+
128
+ image = unredacted_doc
129
+
130
+ # try:
131
+ # image = Image.open(image_annotated['image'])
132
+ # except:
133
+ # image = Image.fromarray(image_annotated['image'].astype('uint8'))
134
+
135
+ draw = ImageDraw.Draw(unredacted_doc)
136
+
137
+ for img_annotation_box in image_annotated['boxes']:
138
+ coords = [img_annotation_box["xmin"],
139
+ img_annotation_box["ymin"],
140
+ img_annotation_box["xmax"],
141
+ img_annotation_box["ymax"]]
142
+
143
+ fill = img_annotation_box["color"]
144
+
145
+ draw.rectangle(coords, fill=fill)
146
+
147
+ out_image_file_path = output_folder + file_base + "_redacted_mod.png"
+ image.save(out_image_file_path)
+ output_files.append(out_image_file_path)
148
+
149
+ doc = [image]
150
+
151
+ # If working with pdfs
152
+ else:
153
+ unredacted_doc = pymupdf.open(file_path)
154
+
155
+ number_of_pages = unredacted_doc.page_count
156
+
157
+ for i in range(0, number_of_pages):
158
+
159
+ print("Re-redacting page", str(i))
160
+
161
+ image_loc = all_image_annotations[i]['image']
162
+ print("Image location:", image_loc)
163
+
164
+ # Load in image
165
+ if isinstance(image_loc, Image.Image):
166
+ # Save to file so the image annotator can pick it up
167
+ image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
168
+ image_loc.save(image_out_folder)
169
+ image = image_out_folder
170
+ elif isinstance(image_loc, str):
171
+ image = Image.open(image_loc)
172
+ else:
173
+ image = Image.fromarray(image_loc.astype('uint8'))
174
+
175
+ pymupdf_page = unredacted_doc.load_page(i) #doc.load_page(current_page -1)
176
+ pymupdf_page, _ = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image) # function returns (page, annotation boxes)
177
+
178
+ #try:
179
+ out_pdf_file_path = output_folder + file_base + "_redacted_mod.pdf"
180
+ unredacted_doc.save(out_pdf_file_path)
181
+ output_files.append(out_pdf_file_path)
182
+
183
+ # Save the gradio_annotation_boxes to a JSON file
184
+ out_annotation_file_path = output_folder + file_base + '_modified_redactions.json'
185
+ all_image_annotations_with_lists = all_image_annotations
186
+
187
+ # Convert image arrays to lists for JSON serialization
188
+ for i, annotation in enumerate(all_image_annotations_with_lists):
+ if isinstance(annotation['image'], np.ndarray):
+ annotation['image'] = annotation['image'].tolist()
+ elif isinstance(annotation['image'], Image.Image):
+ # Save this page's image to its own file so the JSON stores a valid path for that page
+ annotation_image_path = output_folder + file_base + "_page_" + str(i) + "_annotated.png"
+ annotation['image'].save(annotation_image_path)
+ annotation['image'] = annotation_image_path
193
+
194
+ with open(out_annotation_file_path, 'w') as f:
195
+ json.dump(all_image_annotations_with_lists, f)
196
+
197
+ output_files.append(out_annotation_file_path)
198
+
199
+ return doc, all_image_annotations, output_files
200
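The saved JSON can be reloaded later to restore the review state (a sketch with a hypothetical path; tuple colours come back as lists after the round-trip):

import json

with open("output/example_modified_redactions.json") as f:
    all_image_annotations = json.load(f)

for page_annotations in all_image_annotations:
    print(page_annotations["image"], len(page_annotations["boxes"]), "boxes")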
+
201
+ def crop(annotations:AnnotatedImageData):
202
+ if annotations["boxes"]:
203
+ box = annotations["boxes"][0]
204
+ return annotations["image"][
205
+ box["ymin"]:box["ymax"],
206
+ box["xmin"]:box["xmax"]
207
+ ]
208
+ return None
209
+
210
+ def get_boxes_json(annotations:AnnotatedImageData):
211
+ return annotations["boxes"]
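Both helpers expect the annotator's value format; crop additionally assumes the 'image' entry is a numpy array:

import numpy as np

annotations = {
    "image": np.zeros((100, 100, 3), dtype=np.uint8),
    "boxes": [{"xmin": 10, "ymin": 20, "xmax": 30, "ymax": 40, "label": "Redaction", "color": (0, 0, 0)}],
}
print(crop(annotations).shape)       # (20, 20, 3)
print(get_boxes_json(annotations))   # the raw list of box dicts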