Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Dec 17, 2024

Commit

e3365ed

1 Parent(s): 9504619

Started adding in support for custom deny list. Fixed textract call issue. Removed multithreading for now as it mixes up pages

Browse files

Files changed (6) hide show

app.py +1 -1
tools/aws_textract.py +19 -6
tools/custom_image_analyser_engine.py +1 -1
tools/file_conversion.py +85 -93
tools/file_redaction.py +36 -14
tools/load_spacy_model_custom_recognisers.py +27 -29

app.py CHANGED Viewed

@@ -36,7 +36,7 @@ full_comprehend_entity_list.extend(custom_entities)
 chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
-full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
 language = 'en'

 chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
+full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
 language = 'en'

tools/aws_textract.py CHANGED Viewed

@@ -4,6 +4,7 @@ from typing import List
 import io
 #import json
 import pikepdf
 # Example: converting this single page to an image
 #from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
@@ -11,7 +12,7 @@ from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerR
 def extract_textract_metadata(response):
     """Extracts metadata from an AWS Textract response."""
-    print("Document metadata:", response['DocumentMetadata'])
     request_id = response['ResponseMetadata']['RequestId']
     pages = response['DocumentMetadata']['Pages']
@@ -35,16 +36,28 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
             print("Cannot connect to AWS Textract")
             return [], ""  # Return an empty list and an empty string
-    print("Analysing page with AWS Textract")
     # Redact signatures if specified
     if "Redact all identified signatures" in handwrite_signature_checkbox:
-        print("Analysing document with signature detection")
-        response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
     else:
-        print("Analysing document without signature detection")
         # Call detect_document_text to extract plain text
-        response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
     # Wrap the response with the page number in the desired format
     wrapped_response = {

 import io
 #import json
 import pikepdf
+import time
 # Example: converting this single page to an image
 #from pdf2image import convert_from_bytes
 from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
 def extract_textract_metadata(response):
     """Extracts metadata from an AWS Textract response."""
+    #print("Document metadata:", response['DocumentMetadata'])
     request_id = response['ResponseMetadata']['RequestId']
     pages = response['DocumentMetadata']['Pages']
             print("Cannot connect to AWS Textract")
             return [], ""  # Return an empty list and an empty string
+    #print("Analysing page with AWS Textract")
+    #print("pdf_page_bytes:", pdf_page_bytes)
+    #print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
     # Redact signatures if specified
     if "Redact all identified signatures" in handwrite_signature_checkbox:
+        #print("Analysing document with signature detection")
+        try:
+            response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
+        except Exception as e:
+            print("Textract call failed due to:", e, "trying again in 5 seconds.")
+            time.sleep(5)
+            response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
     else:
+        #print("Analysing document without signature detection")
         # Call detect_document_text to extract plain text
+        try:
+            response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
+        except Exception as e:
+            print("Textract call failed due to:", e, "trying again in 5 seconds.")
+            time.sleep(5)
+            response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
     # Wrap the response with the page number in the desired format
     wrapped_response = {

tools/custom_image_analyser_engine.py CHANGED Viewed

@@ -520,7 +520,7 @@ class CustomImageAnalyzerEngine:
                             )
                         except Exception as e:
-                            print(e)
                             time.sleep(3)
                             response = comprehend_client.detect_pii_entities(
                                 Text=current_batch,

                             )
                         except Exception as e:
+                            print("AWS Comprehend call failed due to:", e, "waiting three seconds to try again.")
                             time.sleep(3)
                             response = comprehend_client.detect_pii_entities(
                                 Text=current_batch,

tools/file_conversion.py CHANGED Viewed

@@ -48,127 +48,119 @@ def is_pdf(filename):
-def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
-    """
-    Convert a single page of a PDF to an image and save it as a PNG.
-    Returns the path to the saved image.
-    """
-    try:
-        out_path = f"{pdf_path}_{page_num}.png"
-        # Ensure the directory exists
-        os.makedirs(os.path.dirname(out_path), exist_ok=True)
-        # Check if the image already exists
-        if os.path.exists(out_path):
-            # Load the existing image
-            print(f"Loading existing image for page {page_num + 1}")
-            image = Image.open(out_path)
-        else:
-            # Convert the page to an image
-            print(f"Converting page {page_num + 1}")
-            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
-                                        dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
-            image = image_l[0]
-            # Convert to greyscale
-            image = image.convert("L")
-            image.save(out_path, format="PNG")
-        return out_path
-    except Exception as e:
-        print(f"Error processing page {page_num + 1}: {e}")
-        return None
-def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
-    """
-    Convert pages of a PDF to images using multithreading.
-    """
-    # Get the number of pages in the PDF
-    page_count = pdfinfo_from_path(pdf_path)['Pages']
-    print(f"Number of pages in PDF: {page_count}")
-    images = []
-    # Use ThreadPoolExecutor to process pages in parallel
-    with ThreadPoolExecutor(max_workers=num_threads) as executor:
-        futures = []
-        for page_num in range(page_min, page_count):
-            futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
-        # Display progress using tqdm
-        for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
-            result = future.result()
-            if result:
-                images.append(result)
-            else:
-                print("A page failed to process.")
-    print("PDF has been converted to images.")
-    return images
-# Example usage
-if __name__ == "__main__":
-    pdf_path = "example.pdf"
-    image_dpi = 200
-    output_images = convert_pdf_to_images(pdf_path, image_dpi=image_dpi, num_threads=8)
-    print("Images saved:", output_images)
-# def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
-#     print("pdf_path in convert_pdf_to_images:", pdf_path)
-#     # Get the number of pages in the PDF
-#     page_count = pdfinfo_from_path(pdf_path)['Pages']
-#     print("Number of pages in PDF: ", str(page_count))
-#     images = []
-#     # Open the PDF file
-#     #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
-#     for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
-#         #print("page_num in convert_pdf_to_images:", page_num)
-#         print("Converting page: ", str(page_num + 1))
-#         # Convert one page to image
-#         out_path  = pdf_path + "_" + str(page_num) + ".png"
-#         # Ensure the directory exists
-#         os.makedirs(os.path.dirname(out_path), exist_ok=True)
-#         # Check if the image already exists
-#         if os.path.exists(out_path):
-#             #print(f"Loading existing image from {out_path}.")
-#             image = Image.open(out_path)  # Load the existing image
-#         else:
-#             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
-#             image = image_l[0]
-#             # Convert to greyscale
-#             image = image.convert("L")
-#             image.save(out_path, format="PNG")  # Save the new image
-#         # If no images are returned, break the loop
-#         if not image:
-#             print("Conversion of page", str(page_num), "to file failed.")
-#             break
-#         # print("Conversion of page", str(page_num), "to file succeeded.")
-#         # print("image:", image)
-#         images.append(out_path)
-#     print("PDF has been converted to images.")
-#     # print("Images:", images)
-#     return images
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path:str):

+# def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
+#     """
+#     Convert a single page of a PDF to an image and save it as a PNG.
+#     Returns the path to the saved image.
+#     """
+#     try:
+#         out_path = f"{pdf_path}_{page_num}.png"
+#         # Ensure the directory exists
+#         os.makedirs(os.path.dirname(out_path), exist_ok=True)
+#         # Check if the image already exists
+#         if os.path.exists(out_path):
+#             # Load the existing image
+#             print(f"Loading existing image for page {page_num + 1}")
+#             image = Image.open(out_path)
+#         else:
+#             # Convert the page to an image
+#             print(f"Converting page {page_num + 1}")
+#             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
+#                                         dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+#             image = image_l[0]
+#             # Convert to greyscale
+#             image = image.convert("L")
+#             image.save(out_path, format="PNG")
+#         return out_path
+#     except Exception as e:
+#         print(f"Error processing page {page_num + 1}: {e}")
+#         return None
+# def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
+#     """
+#     Convert pages of a PDF to images using multithreading.
+#     """
+#     # Get the number of pages in the PDF
+#     page_count = pdfinfo_from_path(pdf_path)['Pages']
+#     print(f"Number of pages in PDF: {page_count}")
+#     images = []
+#     # Use ThreadPoolExecutor to process pages in parallel
+#     with ThreadPoolExecutor(max_workers=num_threads) as executor:
+#         futures = []
+#         for page_num in range(page_min, page_count):
+#             futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
+#         # Display progress using tqdm
+#         for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
+#             result = future.result()
+#             if result:
+#                 images.append(result)
+#             else:
+#                 print("A page failed to process.")
+#     print("PDF has been converted to images.")
+#     return images
+def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
+    """
+    Convert the pages of a PDF into greyscale PNG files, one page at a time and in page order.
+    Each page is written next to the PDF as "<pdf_path>_<zero-based page number>.png". A page whose PNG already exists is not rendered again.
+    - pdf_path (str): Path to the PDF file.
+    - page_min (int, optional): Zero-based index of the first page to convert. Defaults to 0.
+    - image_dpi (float, optional): Resolution passed to convert_from_path. Defaults to the module-level image_dpi.
+    - progress (optional): Progress tracking object. Not used inside this function.
+    Returns a list of the page image paths, in page order. The list is cut short if a page fails to render.
+    """
+    print("pdf_path in convert_pdf_to_images:", pdf_path)
+    # Get the number of pages in the PDF
+    page_count = pdfinfo_from_path(pdf_path)['Pages']
+    print("Number of pages in PDF: ", str(page_count))
+    # Every page image is written beside the PDF, so the output folder only needs creating once.
+    # os.makedirs('') raises, so skip the call when pdf_path has no directory part.
+    out_dir = os.path.dirname(pdf_path)
+    if out_dir:
+        os.makedirs(out_dir, exist_ok=True)
+    images = []
+    # tqdm takes its total from the range itself, so the bar is still correct when page_min > 0
+    for page_num in tqdm(range(page_min, page_count), unit="pages", desc="Preparing pages"):
+        print("Converting page: ", str(page_num + 1))
+        out_path = pdf_path + "_" + str(page_num) + ".png"
+        if os.path.exists(out_path):
+            # Reuse the image from an earlier run. Open it only to confirm it is a readable image,
+            # and close the file handle straight away so handles do not pile up on long documents.
+            with Image.open(out_path):
+                pass
+        else:
+            # Convert one page to image
+            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+            # If no image is returned, stop and return the pages converted so far
+            if not image_l:
+                print("Conversion of page", str(page_num), "to file failed.")
+                break
+            # Convert to greyscale and save the new image
+            image_l[0].convert("L").save(out_path, format="PNG")
+        images.append(out_path)
+    print("PDF has been converted to images.")
+    return images
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path:str):

tools/file_redaction.py CHANGED Viewed

@@ -26,14 +26,14 @@ from presidio_analyzer import RecognizerResult
 from tools.aws_functions import RUN_AWS_FUNCTIONS
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
 from tools.file_conversion import process_file, image_dpi
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
 from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
-page_break_value = get_or_create_env_var('page_break_value', '500')
 print(f'The value of page_break_value is {page_break_value}')
 max_time_value = get_or_create_env_var('max_time_value', '999999')
@@ -526,14 +526,14 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
     scale_width = image_page_width / mediabox_width
     scale_height = image_page_height / mediabox_height
-    print("scale_width:", scale_width)
-    print("scale_height:", scale_height)
     rect_to_mediabox_x_scale = mediabox_width / rect_width
     rect_to_mediabox_y_scale = mediabox_height / rect_height
-    print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
-    print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
     # Adjust coordinates based on scaling factors
     x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
@@ -815,8 +815,10 @@ def redact_image_pdf(file_path:str,
                      pymupdf_doc = [],
                      pii_identification_method:str="Local",
                      comprehend_query_number:int=0,
-                     comprehend_client="",
-                     textract_client="",
                      page_break_val:int=int(page_break_value),
                      logging_file_paths:List=[],
                      max_time:int=int(max_time_value),
@@ -847,6 +849,8 @@ def redact_image_pdf(file_path:str,
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
     - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
     - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
     - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
     - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
@@ -855,10 +859,19 @@ def redact_image_pdf(file_path:str,
     The function returns a fully or partially-redacted PDF document.
     '''
     file_name = get_file_path_end(file_path)
-    fill = (0, 0, 0)   # Fill colour
-    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
     comprehend_query_number_new = 0
     if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
         print("Connection to AWS Comprehend service unsuccessful.")
@@ -913,8 +926,7 @@ def redact_image_pdf(file_path:str,
             image = prepared_pdf_file_paths[page_no]#.copy()
             #print("image:", image)
         except Exception as e:
-            print("Could not redact page:", reported_page_number, "due to:")
-            print(e)
             continue
         image_annotations = {"image": image, "boxes": []}
@@ -975,7 +987,7 @@ def redact_image_pdf(file_path:str,
                         if not page_exists:  # If the page does not exist, analyze again
                             print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
-                            text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, handwrite_signature_checkbox)  # Analyse page with Textract
                             # Check if "pages" key exists, if not, initialize it as an empty list
                             if "pages" not in existing_data:
@@ -1405,6 +1417,8 @@ def redact_text_pdf(
     pii_identification_method: str = "Local",
     comprehend_query_number:int = 0,
     comprehend_client="",
     page_break_val: int = int(page_break_value),  # Value for page break
     max_time: int = int(max_time_value),
     progress: Progress = Progress(track_tqdm=True)  # Progress tracking object
@@ -1431,7 +1445,9 @@ def redact_text_pdf(
     - pymupdf_doc: List of PyMuPDF documents
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
-    - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
@@ -1441,6 +1457,12 @@ def redact_text_pdf(
         print("Connection to AWS Comprehend service not found.")
         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
     tic = time.perf_counter()

 from tools.aws_functions import RUN_AWS_FUNCTIONS
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
 from tools.file_conversion import process_file, image_dpi
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
 from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
+page_break_value = get_or_create_env_var('page_break_value', '50000')
 print(f'The value of page_break_value is {page_break_value}')
 max_time_value = get_or_create_env_var('max_time_value', '999999')
     scale_width = image_page_width / mediabox_width
     scale_height = image_page_height / mediabox_height
+    #print("scale_width:", scale_width)
+    #print("scale_height:", scale_height)
     rect_to_mediabox_x_scale = mediabox_width / rect_width
     rect_to_mediabox_y_scale = mediabox_height / rect_height
+    #print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
+    #print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
     # Adjust coordinates based on scaling factors
     x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
                      pymupdf_doc = [],
                      pii_identification_method:str="Local",
                      comprehend_query_number:int=0,
+                     comprehend_client:str="",
+                     textract_client:str="",
+                     custom_recogniser_word_list:List[str]=[],
+                     redact_whole_page_list:List[str]=[],
                      page_break_val:int=int(page_break_value),
                      logging_file_paths:List=[],
                      max_time:int=int(max_time_value),
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
     - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
     - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
+    - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
+    - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
     - page_break_val (int, optional): The value at which to trigger a page break. Defaults to the page_break_value setting read from the environment.
     - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     The function returns a fully or partially-redacted PDF document.
     '''
     file_name = get_file_path_end(file_path)
+    fill = (0, 0, 0)   # Fill colour for redactions
     comprehend_query_number_new = 0
+    # Update custom word list analyser object with any new words that have been added to the custom deny list
+    if custom_recogniser_word_list:
+        nlp_analyser.registry.remove_recognizer("CUSTOM")
+        new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
+        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
     if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
         print("Connection to AWS Comprehend service unsuccessful.")
             image = prepared_pdf_file_paths[page_no]#.copy()
             #print("image:", image)
         except Exception as e:
+            print("Could not redact page:", reported_page_number, "due to:", e)
             continue
         image_annotations = {"image": image, "boxes": []}
                         if not page_exists:  # If the page does not exist, analyze again
                             print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
+                            text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
                             # Check if "pages" key exists, if not, initialize it as an empty list
                             if "pages" not in existing_data:
     pii_identification_method: str = "Local",
     comprehend_query_number:int = 0,
     comprehend_client="",
+    custom_recogniser_word_list:List[str]=[],
+    redact_whole_page_list:List[str]=[],
     page_break_val: int = int(page_break_value),  # Value for page break
     max_time: int = int(max_time_value),
     progress: Progress = Progress(track_tqdm=True)  # Progress tracking object
     - pymupdf_doc: List of PyMuPDF documents
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
+    - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
+    - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
         print("Connection to AWS Comprehend service not found.")
         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
+    # Update custom word list analyser object with any new words that have been added to the custom deny list
+    if custom_recogniser_word_list:
+        nlp_analyser.registry.remove_recognizer("CUSTOM")
+        new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
+        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
     tic = time.perf_counter()

tools/load_spacy_model_custom_recognisers.py CHANGED Viewed

@@ -10,17 +10,37 @@ import re
 # %%
 model_name = "en_core_web_lg" #"en_core_web_trf"
 score_threshold = 0.001
-custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
-# %% [markdown]
 # #### Custom recognisers
-# %%
 # Custom title recogniser
 titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
 titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
 titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
-titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern],
     global_regex_flags=re.DOTALL | re.MULTILINE)
 # %%
@@ -34,7 +54,7 @@ ukpostcode_pattern = Pattern(
 )
 # Define the recognizer with one or more patterns
-ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
 # %%
 # Examples for testing
@@ -134,49 +154,27 @@ class StreetNameRecognizer(EntityRecognizer):
 street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
-# %%
 # Create a class inheriting from SpacyNlpEngine
 class LoadedSpacyNlpEngine(SpacyNlpEngine):
     def __init__(self, loaded_spacy_model):
         super().__init__()
         self.nlp = {"en": loaded_spacy_model}
-# %%
-#Load spacy model
-try:
-	import en_core_web_lg
-	nlp = en_core_web_lg.load()
-	print("Successfully imported spaCy model")
-except:
-	download("en_core_web_lg")
-	nlp = spacy.load("en_core_web_lg")
-	print("Successfully downloaded and imported spaCy model")
-# try:
-# 	import en_core_web_sm
-# 	nlp = en_core_web_sm.load()
-# 	print("Successfully imported spaCy model")
-# except:
-# 	download("en_core_web_sm")
-# 	nlp = spacy.load("en_core_web_sm")
-# 	print("Successfully downloaded and imported spaCy model")
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
-# %%
 nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
                 default_score_threshold=score_threshold,
                 supported_languages=["en"],
                 log_decision_process=False,
                 )
-# %%
 nlp_analyser.registry.add_recognizer(street_recogniser)
 nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
 nlp_analyser.registry.add_recognizer(titles_recogniser)

 # %%
 model_name = "en_core_web_lg" #"en_core_web_trf"
 score_threshold = 0.001
+custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
+# Load the spaCy model, downloading it first if it cannot be imported
+# NOTE(review): the import below is hard-coded to en_core_web_lg while the fallback uses model_name; keep the two in step if model_name changes.
+try:
+    import en_core_web_lg
+    nlp = en_core_web_lg.load()
+    print("Successfully imported spaCy model")
+except Exception:
+    # Exception rather than a bare except, so KeyboardInterrupt and SystemExit are not mistaken for a missing model
+    download(model_name)
+    nlp = spacy.load(model_name)
+    print("Successfully downloaded and imported spaCy model")
 # #### Custom recognisers
+# Allow user to create their own recogniser
+def custom_word_list_recogniser(custom_list:List[str]=[]):
+    """
+    Build a PatternRecognizer that flags every term in a user-supplied deny list as a "CUSTOM" entity.
+    - custom_list (List[str], optional): Words or phrases to redact. Blank entries are ignored and the list itself is not modified.
+    Returns a PatternRecognizer named "CUSTOM" that matches each term between word boundaries. With no usable terms the pattern matches nothing.
+    """
+    # Drop blank entries: an empty alternative would reduce the regex to '\b\b', which matches (with zero length) at every word boundary.
+    terms = [term for term in (custom_list or []) if term and term.strip()]
+    # Regex alternation takes the first alternative that matches, so put longer phrases first; otherwise 'New' would win over 'New York'.
+    terms.sort(key=len, reverse=True)
+    if terms:
+        custom_regex = '\\b' + '\\b|\\b'.join(re.escape(term) for term in terms) + '\\b'
+    else:
+        # '(?!)' is an empty negative lookahead that can never match, so the placeholder recogniser finds nothing.
+        custom_regex = '(?!)'
+    custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
+    # The name "CUSTOM" is what callers pass to registry.remove_recognizer when swapping in a new word list.
+    return PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
+        global_regex_flags=re.DOTALL | re.MULTILINE)
+# Initialise custom recogniser that will be overwritten later
+custom_recogniser = custom_word_list_recogniser()
 # Custom title recogniser
 titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
 titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
 titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
+titles_recogniser = PatternRecognizer(supported_entity="TITLES", name="TITLES", patterns = [titles_pattern],
     global_regex_flags=re.DOTALL | re.MULTILINE)
 # %%
 )
 # Define the recognizer with one or more patterns
+ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
 # %%
 # Examples for testing
 street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
 # Create a class inheriting from SpacyNlpEngine
 class LoadedSpacyNlpEngine(SpacyNlpEngine):
     """
     SpacyNlpEngine that uses a spaCy model which has already been loaded.
     - loaded_spacy_model: The loaded spaCy model, stored as the engine's "en" model.
     """
     def __init__(self, loaded_spacy_model):
         super().__init__()
         # Assigned after the parent constructor runs, so this mapping is the one the engine ends up with
         self.nlp = {"en": loaded_spacy_model}
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
 nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
                 default_score_threshold=score_threshold,
                 supported_languages=["en"],
                 log_decision_process=False,
                 )
+# Add custom recognisers to nlp_analyser
 nlp_analyser.registry.add_recognizer(street_recogniser)
 nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
 nlp_analyser.registry.add_recognizer(titles_recogniser)
+nlp_analyser.registry.add_recognizer(custom_recogniser)