Commit f0c28d7
Parent(s): e3365ed
Updated packages. Reinstated multithreading for page loading, now with page order protected. Smaller spaCy model used for speed. Textract calls should now be faster.
Files changed:
- requirements.txt +6 -6
- tools/aws_textract.py +3 -3
- tools/file_conversion.py +110 -123
- tools/file_redaction.py +97 -36
- tools/helper_functions.py +8 -3
- tools/load_spacy_model_custom_recognisers.py +4 -4
requirements.txt CHANGED

@@ -7,12 +7,12 @@ presidio_anonymizer==2.2.355
 presidio-image-redactor==0.0.53
 pikepdf==8.15.1
 pandas==2.2.3
-spacy==3.
-en_core_web_lg @ https://github.com/explosion/spacy
-
-gradio==5.
-boto3==1.35.
-pyarrow==
+spacy==3.8.3
+#en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==5.9.0
+boto3==1.35.83
+pyarrow==18.1.0
 openpyxl==3.1.2
 Faker==22.2.0
 gradio_image_annotation==0.2.5
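The headline change here is swapping the large spaCy model for the much smaller pinned en_core_web_sm wheel, alongside version bumps for spaCy, Gradio, boto3 and pyarrow. As a quick sanity check that the new pins resolve after `pip install -r requirements.txt`, a minimal sketch (not repo code, just an illustration):

    # Quick check, not part of the repo: confirm the new pins and the small
    # spaCy model resolve after installing the updated requirements.
    import spacy, gradio, boto3, pyarrow

    print("spacy", spacy.__version__)        # expect 3.8.3
    print("gradio", gradio.__version__)      # expect 5.9.0
    print("boto3", boto3.__version__)        # expect 1.35.83
    print("pyarrow", pyarrow.__version__)    # expect 18.1.0
    print("en_core_web_sm", spacy.load("en_core_web_sm").meta["version"])  # expect 3.8.0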
tools/aws_textract.py CHANGED

@@ -46,8 +46,8 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
     try:
         response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
     except Exception as e:
-        print("Textract call failed due to:", e, "trying again in
-        time.sleep(
+        print("Textract call failed due to:", e, "trying again in 3 seconds.")
+        time.sleep(3)
         response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
     else:
         #print("Analysing document without signature detection")

@@ -185,7 +185,7 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):

             if recogniser_result not in handwriting:
                 handwriting.append(recogniser_result)
-                print("Handwriting found:", handwriting[-1])
+                #print("Handwriting found:", handwriting[-1])

     # If handwriting or signature, add to bounding box
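The change above hard-codes a single retry after a 3-second pause. For reference, a more defensive variant with exponential backoff might look like the sketch below. This is an illustration only, not code from the commit; the helper name is invented, and the only external call it relies on is the same boto3 Textract analyze_document call already used in the diff.

    import time

    def analyse_with_retry(client, pdf_page_bytes, max_attempts=3, base_delay=3):
        # Hypothetical helper (not in the repo): retry the Textract call with
        # exponential backoff instead of a single fixed 3-second pause.
        for attempt in range(max_attempts):
            try:
                return client.analyze_document(
                    Document={'Bytes': pdf_page_bytes},
                    FeatureTypes=["SIGNATURES"],
                )
            except Exception as e:
                if attempt == max_attempts - 1:
                    raise
                delay = base_delay * (2 ** attempt)
                print("Textract call failed due to:", e, f"retrying in {delay} seconds.")
                time.sleep(delay)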
tools/file_conversion.py CHANGED

@@ -48,122 +48,112 @@ def is_pdf(filename):


-# print(f"Converting page {page_num + 1}")
-# image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
-#                             dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
-# image = image_l[0]
-# # Convert to greyscale
-# image = image.convert("L")
-# image.save(out_path, format="PNG")
-# return out_path
-# except Exception as e:
-#     print(f"Error processing page {page_num + 1}: {e}")
-#     return None
-# def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
-#     """
-#     Convert pages of a PDF to images using multithreading.
-#     """
-#     # Get the number of pages in the PDF
-#     page_count = pdfinfo_from_path(pdf_path)['Pages']
-#     print(f"Number of pages in PDF: {page_count}")
-#     print("A page failed to process.")
-def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
+def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple[int, str]:
+    try:
+        out_path = f"{pdf_path}_{page_num}.png"
+        os.makedirs(os.path.dirname(out_path), exist_ok=True)
+        if os.path.exists(out_path):
+            print(f"Loading existing image for page {page_num + 1}")
+            image = Image.open(out_path)
+        else:
+            print(f"Converting page {page_num + 1}")
+            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
+                                        dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+            image = image_l[0]
+            image = image.convert("L")
+            image.save(out_path, format="PNG")
+        return page_num, out_path
+    except Exception as e:
+        print(f"Error processing page {page_num + 1}: {e}")
+        return page_num, None
+
+def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
+
+    # If preparing for review, just load the first page
+    if prepare_for_review == True:
+        page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
+    else:
+        page_count = pdfinfo_from_path(pdf_path)['Pages']
+
+    print(f"Number of pages in PDF: {page_count}")
+
+    results = []
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        futures = []
+        for page_num in range(page_min, page_count):
+            futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
+
+        for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
+            page_num, result = future.result()
+            if result:
+                results.append((page_num, result))
+            else:
+                print(f"Page {page_num + 1} failed to process.")
+
+    # Sort results by page number
+    results.sort(key=lambda x: x[0])
+    images = [result[1] for result in results]
+
+    print("PDF has been converted to images.")
+    return images
+
+# def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
+#     print("pdf_path in convert_pdf_to_images:", pdf_path)
+#     # Get the number of pages in the PDF
+#     page_count = pdfinfo_from_path(pdf_path)['Pages']
+#     print("Number of pages in PDF: ", str(page_count))
+#     images = []
+#     # Open the PDF file
+#     #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
+#     for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
+#         #print("page_num in convert_pdf_to_images:", page_num)
+#         print("Converting page: ", str(page_num + 1))
+#         # Convert one page to image
+#         out_path = pdf_path + "_" + str(page_num) + ".png"
+#         # Ensure the directory exists
+#         os.makedirs(os.path.dirname(out_path), exist_ok=True)
+#         # Check if the image already exists
+#         if os.path.exists(out_path):
+#             #print(f"Loading existing image from {out_path}.")
+#             image = Image.open(out_path) # Load the existing image
+#         else:
+#             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+#             image = image_l[0]
+#             # Convert to greyscale
+#             image = image.convert("L")
+#             image.save(out_path, format="PNG") # Save the new image
+#         # If no images are returned, break the loop
+#         if not image:
+#             print("Conversion of page", str(page_num), "to file failed.")
+#             break
+#         # print("Conversion of page", str(page_num), "to file succeeded.")
+#         # print("image:", image)
+#         images.append(out_path)
+#     print("PDF has been converted to images.")
+#     # print("Images:", images)
+#     return images

 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
-def process_file(file_path:str):
+def process_file(file_path:str, prepare_for_review:bool=False):
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()

@@ -178,7 +168,7 @@ def process_file(file_path:str):
     elif file_extension == '.pdf':
         print(f"{file_path} is a PDF file. Converting to image set")
         # Run your function for processing PDF files here
-        img_object = convert_pdf_to_images(file_path)
+        img_object = convert_pdf_to_images(file_path, prepare_for_review)

     else:
         print(f"{file_path} is not an image or PDF file.")

@@ -195,7 +185,7 @@ def get_input_file_names(file_input):
     file_name_with_extension = ""
     full_file_name = ""

-    print("file_input in input file names:", file_input)
+    #print("file_input in input file names:", file_input)
     if isinstance(file_input, dict):
         file_input = os.path.abspath(file_input["name"])

@@ -222,8 +212,6 @@ def get_input_file_names(file_input):

     all_relevant_files_str = ", ".join(all_relevant_files)

-    print("all_relevant_files_str:", all_relevant_files_str)
-
     return all_relevant_files_str, file_name_with_extension, full_file_name

 def prepare_image_or_pdf(

@@ -253,6 +241,7 @@ def prepare_image_or_pdf(
         out_message (List[str]): List to store output messages.
         first_loop_state (bool): Flag indicating if this is the first iteration.
         number_of_pages (int): integer indicating the number of pages in the document
+        current_loop_page_number (int): Current number of loop
         all_annotations_object(List of annotation objects): All annotations for current document
         prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
         progress (Progress): Progress tracker for the operation.

@@ -352,11 +341,11 @@ def prepare_image_or_pdf(
         if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option

         # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
         if file_path.endswith(".json"):

             if prepare_for_review == True:
+                print("Preparing file for review")
                 if isinstance(file_path, str):
                     with open(file_path, 'r') as json_file:
                         all_annotations_object = json.load(json_file)

@@ -372,11 +361,12 @@ def prepare_image_or_pdf(
                 ]
                 image_file_paths_pages = [int(i) for i in image_file_paths_pages]

-                # If PDF pages have been converted to image files, replace the current image paths in the json to this
+                # If PDF pages have been converted to image files, replace the current image paths in the json to this.
                 if image_file_paths:
+
                     for i, annotation in enumerate(all_annotations_object):
                         annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
+                        #print("Annotation page number:", annotation_page_number)

                         # Check if the annotation page number exists in the image file paths pages
                         if annotation_page_number in image_file_paths_pages:

@@ -385,7 +375,7 @@ def prepare_image_or_pdf(
                             correct_image_page = annotation_page_number
                             annotation["image"] = image_file_paths[correct_image_page]
                         else:
-                            print("Page not found.")
+                            print("Page", annotation_page_number, "image file not found.")

                 #print("all_annotations_object:", all_annotations_object)

@@ -404,30 +394,24 @@ def prepare_image_or_pdf(
                     json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
                 continue

-            elif in_redact_method == text_ocr_option:
-                if is_pdf(file_path) == False:
-                    out_message = "Please upload a PDF file for text analysis."
-                    print(out_message)
-                    return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+            # Must be a pdf or image at this point
+            else:
+
+                # Convert pdf/image file to correct format for redaction
+                if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
+                    if is_pdf_or_image(file_path) == False:
+                        out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+                        print(out_message)
+                        return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+
+                elif in_redact_method == text_ocr_option:
+                    if is_pdf(file_path) == False:
+                        out_message = "Please upload a PDF file for text analysis."
+                        print(out_message)
+                        return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

                 converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
-                image_file_path = process_file(file_path)
+                image_file_path = process_file(file_path, prepare_for_review)

                 converted_file_paths.append(converted_file_path)
                 image_file_paths.extend(image_file_path)

@@ -453,7 +437,10 @@ def prepare_image_or_pdf(
         out_message.append(out_time)
         out_message_out = '\n'.join(out_message)

+        if prepare_for_review == False:
             number_of_pages = len(image_file_paths)
+        else:
+            number_of_pages = len(all_annotations_object)


     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
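The order protection mentioned in the commit message comes from tagging each worker result with its page number and sorting before returning: as_completed yields futures in completion order, not submission order, so without the sort the page images could come back shuffled. A stripped-down illustration of the same pattern (not repo code; the worker function here is invented for the example):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fake_convert(page_num: int) -> tuple[int, str]:
        # Stand-in for process_single_page: return the page number with its "output path".
        return page_num, f"page_{page_num}.png"

    results = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(fake_convert, n) for n in range(10)]
        for future in as_completed(futures):      # completion order is arbitrary
            results.append(future.result())

    results.sort(key=lambda x: x[0])              # restore page order
    images = [path for _, path in results]
    print(images)  # ['page_0.png', 'page_1.png', ..., 'page_9.png']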
tools/file_redaction.py CHANGED

@@ -689,8 +689,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)

-    print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
-
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:

@@ -906,6 +904,30 @@ def redact_image_pdf(file_path:str,
     if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
     elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

+    # If running Textract, check if file already exists. If it does, load in existing data
+    # Import results from json and convert
+    if analysis_type == textract_option:
+
+        json_file_path = output_folder + file_name + "_textract.json"
+        logging_file_paths.append(json_file_path)
+
+        if not os.path.exists(json_file_path):
+            no_textract_file = True
+            print("No existing Textract results file found.")
+            existing_data = {}
+            #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+            #logging_file_paths.append(json_file_path)
+            #request_metadata = request_metadata + "\n" + new_request_metadata
+            #wrapped_text_blocks = {"pages":[text_blocks]}
+        else:
+            # Open the file and load the JSON data
+            no_textract_file = False
+            print("Found existing Textract json results file.")
+            with open(json_file_path, 'r') as json_file:
+                existing_data = json.load(json_file)
+
+    ###
+
     if current_loop_page == 0: page_loop_start = 0
     else: page_loop_start = current_loop_page

@@ -919,7 +941,7 @@ def redact_image_pdf(file_path:str,
         page_break_return = False

         reported_page_number = str(page_no + 1)
-        print("Redacting page:", reported_page_number)
+        #print("Redacting page:", reported_page_number)

         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
         try:

@@ -962,49 +984,72 @@ def redact_image_pdf(file_path:str,
                 image_buffer = io.BytesIO()
                 image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
                 pdf_page_as_bytes = image_buffer.getvalue()
-
-                json_file_path = output_folder + file_name + "_textract.json"
-
-                if not os.path.exists(json_file_path):
+
+                if not existing_data:
                     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
                     logging_file_paths.append(json_file_path)
                     request_metadata = request_metadata + "\n" + new_request_metadata

-                    # Write the updated existing_data back to the JSON file
-                    with open(json_file_path, 'w') as json_file:
-                        json.dump(wrapped_text_blocks, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                    existing_data = {"pages":[text_blocks]}

                 else:
-                    #
-                    with open(json_file_path, 'r') as json_file:
-                        existing_data = json.load(json_file)
-
-                    existing_data["pages"].append(text_blocks)
-
-                    # Write the updated existing_data back to the JSON file
-                    with open(json_file_path, 'w') as json_file:
-                        json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                    # Check if the current reported_page_number exists in the loaded JSON
+                    page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
+
+                    if not page_exists: # If the page does not exist, analyze again
+                        print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
+                        text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+
+                        # Check if "pages" key exists, if not, initialize it as an empty list
+                        if "pages" not in existing_data:
+                            existing_data["pages"] = []
+
+                        # Append the new page data
+                        existing_data["pages"].append(text_blocks)
+
+                        request_metadata = request_metadata + "\n" + new_request_metadata
+                    else:
+                        # If the page exists, retrieve the data
+                        text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
+
+                # if not os.path.exists(json_file_path):
+                #     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+                #     logging_file_paths.append(json_file_path)
+                #     request_metadata = request_metadata + "\n" + new_request_metadata
+                #     existing_data = {"pages":[text_blocks]}
+
+                # else:
+                #     # Open the file and load the JSON data
+                #     print("Found existing Textract json results file.")
+                #     with open(json_file_path, 'r') as json_file:
+                #         existing_data = json.load(json_file)
+
+                #     # Check if the current reported_page_number exists in the loaded JSON
+                #     page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
+
+                #     if not page_exists: # If the page does not exist, analyze again
+                #         print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
+                #         text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+
+                #         # Check if "pages" key exists, if not, initialize it as an empty list
+                #         if "pages" not in existing_data:
+                #             existing_data["pages"] = []
+
+                #         # Append the new page data
+                #         existing_data["pages"].append(text_blocks)
+
+                #         # Write the updated existing_data back to the JSON file
+                #         with open(json_file_path, 'w') as json_file:
+                #             json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
+                #         logging_file_paths.append(json_file_path)
+                #         request_metadata = request_metadata + "\n" + new_request_metadata
+                #     else:
+                #         # If the page exists, retrieve the data
+                #         text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)

                 line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

@@ -1124,6 +1169,11 @@ def redact_image_pdf(file_path:str,

         annotations_all_pages.append(image_annotations)

+        if analysis_type == textract_option:
+            # Write the updated existing textract data back to the JSON file
+            with open(json_file_path, 'w') as json_file:
+                json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
         current_loop_page += 1

         return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

@@ -1142,7 +1192,18 @@ def redact_image_pdf(file_path:str,
             progress.close(_tqdm=progress_bar)
             tqdm._instances.clear()

+            if analysis_type == textract_option:
+                # Write the updated existing textract data back to the JSON file
+                with open(json_file_path, 'w') as json_file:
+                    json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
             return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
+
+    if analysis_type == textract_option:
+        # Write the updated existing textract data back to the JSON file
+
+        with open(json_file_path, 'w') as json_file:
+            json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed

     return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

@@ -1675,8 +1736,8 @@ def redact_text_pdf(
                 pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)

                 #print("Did redact_page_with_pymupdf function")
-
-                print("For page number:",
+                reported_page_no = page_no + 1
+                print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")

                 # Write logs
                 # Create decision process table
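The net effect of these changes is a simple per-document cache: Textract output is kept in a <file_name>_textract.json file, pages already present in that file are reused, and only missing pages trigger a new analyze_document call, with the file written back once per loop. A stripped-down sketch of the same lookup pattern (illustrative only, not repo code; the page-record shape with 'page_no' and 'data' keys is taken from the read path in the diff above, and analyse_fn stands in for the Textract call):

    import json, os

    def get_page_blocks(json_file_path: str, page_no: str, analyse_fn):
        # Illustrative helper: reuse cached Textract output for a page, calling
        # analyse_fn(page_no) only when the page is not in the cache yet.
        existing_data = {}
        if os.path.exists(json_file_path):
            with open(json_file_path, 'r') as f:
                existing_data = json.load(f)

        pages = existing_data.setdefault("pages", [])
        for page in pages:
            if page.get('page_no') == page_no:
                return page['data'], existing_data   # cache hit, no Textract call

        blocks = analyse_fn(page_no)                 # cache miss: call Textract
        pages.append({'page_no': page_no, 'data': blocks})
        with open(json_file_path, 'w') as f:
            json.dump(existing_data, f, indent=4)
        return blocks, existing_data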
tools/helper_functions.py CHANGED

@@ -31,9 +31,9 @@ def get_or_create_env_var(var_name, default_value):


 # Names for options labels
-text_ocr_option = "
-tesseract_ocr_option = "OCR
-textract_option = "
+text_ocr_option = "Local model - selectable text"
+tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
+textract_option = "AWS Textract service - all PDF types"

 local_pii_detector = "Local"
 aws_pii_detector = "AWS Comprehend"

@@ -263,6 +263,11 @@ async def get_connection_params(request: gr.Request):
             base_folder = "user-files/"
             print("Cognito ID found:", out_session_hash)

+        elif 'x-amzn-oidc-identity' in request.headers:
+            out_session_hash = request.headers['x-amzn-oidc-identity']
+            base_folder = "user-files/"
+            print("Cognito ID found:", out_session_hash)
+
         else:
             out_session_hash = request.session_hash
             base_folder = "temp-files/"
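The new elif branch lets the app pick up a user identity from the x-amzn-oidc-identity header (set by an AWS ALB with Cognito authentication) before falling back to the Gradio session hash for anonymous sessions. A minimal sketch of that fallback order (illustrative only; the header name and folder values come from the diff, the helper itself is not repo code):

    def resolve_session_id(headers: dict, session_hash: str) -> tuple[str, str]:
        # Mirror the fallback used in get_connection_params for deciding
        # which folder a user's files go into (sketch, not the repo's API).
        if 'x-amzn-oidc-identity' in headers:
            return headers['x-amzn-oidc-identity'], "user-files/"   # authenticated request
        return session_hash, "temp-files/"                          # anonymous Gradio session

    print(resolve_session_id({'x-amzn-oidc-identity': 'user-123'}, 'abc'))  # ('user-123', 'user-files/')
    print(resolve_session_id({}, 'abc'))                                    # ('abc', 'temp-files/')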
tools/load_spacy_model_custom_recognisers.py CHANGED

@@ -8,20 +8,20 @@ from spacy.cli.download import download
 import re

 # %%
-model_name = "
+model_name = "en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001
 custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]

 #Load spacy model
 try:
-    import
-    nlp =
+    import en_core_web_sm
+    nlp = en_core_web_sm.load()
     print("Successfully imported spaCy model")

 except:
     download(model_name)
     nlp = spacy.load(model_name)
-    print("Successfully downloaded and imported spaCy model")
+    print("Successfully downloaded and imported spaCy model", model_name)

 # #### Custom recognisers
 # Allow user to create their own recogniser