seanpedrickcase committed on
Commit
e3365ed
·
1 Parent(s): 9504619

Started adding support for a custom deny list. Fixed the Textract call issue. Removed multithreading for now, as it mixes up the page order.

Browse files
app.py CHANGED
@@ -36,7 +36,7 @@ full_comprehend_entity_list.extend(custom_entities)
36
 
37
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
38
 
39
- full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
40
 
41
  language = 'en'
42
 
 
36
 
37
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
38
 
39
+ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
40
 
41
  language = 'en'
42
 
tools/aws_textract.py CHANGED
@@ -4,6 +4,7 @@ from typing import List
4
  import io
5
  #import json
6
  import pikepdf
 
7
  # Example: converting this single page to an image
8
  #from pdf2image import convert_from_bytes
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
@@ -11,7 +12,7 @@ from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerR
11
  def extract_textract_metadata(response):
12
  """Extracts metadata from an AWS Textract response."""
13
 
14
- print("Document metadata:", response['DocumentMetadata'])
15
 
16
  request_id = response['ResponseMetadata']['RequestId']
17
  pages = response['DocumentMetadata']['Pages']
@@ -35,16 +36,28 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
35
  print("Cannot connect to AWS Textract")
36
  return [], "" # Return an empty list and an empty string
37
 
38
- print("Analysing page with AWS Textract")
 
 
39
 
40
  # Redact signatures if specified
41
  if "Redact all identified signatures" in handwrite_signature_checkbox:
42
- print("Analysing document with signature detection")
43
- response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
 
 
 
 
 
44
  else:
45
- print("Analysing document without signature detection")
46
  # Call detect_document_text to extract plain text
47
- response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
 
 
 
 
 
48
 
49
  # Wrap the response with the page number in the desired format
50
  wrapped_response = {
 
4
  import io
5
  #import json
6
  import pikepdf
7
+ import time
8
  # Example: converting this single page to an image
9
  #from pdf2image import convert_from_bytes
10
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
 
12
  def extract_textract_metadata(response):
13
  """Extracts metadata from an AWS Textract response."""
14
 
15
+ #print("Document metadata:", response['DocumentMetadata'])
16
 
17
  request_id = response['ResponseMetadata']['RequestId']
18
  pages = response['DocumentMetadata']['Pages']
 
36
  print("Cannot connect to AWS Textract")
37
  return [], "" # Return an empty list and an empty string
38
 
39
+ #print("Analysing page with AWS Textract")
40
+ #print("pdf_page_bytes:", pdf_page_bytes)
41
+ #print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
42
 
43
  # Redact signatures if specified
44
  if "Redact all identified signatures" in handwrite_signature_checkbox:
45
+ #print("Analysing document with signature detection")
46
+ try:
47
+ response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
48
+ except Exception as e:
49
+ print("Textract call failed due to:", e, "trying again in 5 seconds.")
50
+ time.sleep(5)
51
+ response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
52
  else:
53
+ #print("Analysing document without signature detection")
54
  # Call detect_document_text to extract plain text
55
+ try:
56
+ response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
57
+ except Exception as e:
58
+ print("Textract call failed due to:", e, "trying again in 5 seconds.")
59
+ time.sleep(5)
60
+ response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
61
 
62
  # Wrap the response with the page number in the desired format
63
  wrapped_response = {
tools/custom_image_analyser_engine.py CHANGED
@@ -520,7 +520,7 @@ class CustomImageAnalyzerEngine:
520
  )
521
 
522
  except Exception as e:
523
- print(e)
524
  time.sleep(3)
525
  response = comprehend_client.detect_pii_entities(
526
  Text=current_batch,
 
520
  )
521
 
522
  except Exception as e:
523
+ print("AWS Comprehend call failed due to:", e, "waiting three seconds to try again.")
524
  time.sleep(3)
525
  response = comprehend_client.detect_pii_entities(
526
  Text=current_batch,
tools/file_conversion.py CHANGED
@@ -48,127 +48,119 @@ def is_pdf(filename):
48
 
49
 
50
 
51
- def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
52
- """
53
- Convert a single page of a PDF to an image and save it as a PNG.
54
- Returns the path to the saved image.
55
- """
56
- try:
57
- out_path = f"{pdf_path}_{page_num}.png"
58
 
59
- # Ensure the directory exists
60
- os.makedirs(os.path.dirname(out_path), exist_ok=True)
61
 
62
- # Check if the image already exists
63
- if os.path.exists(out_path):
64
- # Load the existing image
65
- print(f"Loading existing image for page {page_num + 1}")
66
- image = Image.open(out_path)
67
- else:
68
- # Convert the page to an image
69
- print(f"Converting page {page_num + 1}")
70
- image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
71
- dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
72
- image = image_l[0]
73
 
74
- # Convert to greyscale
75
- image = image.convert("L")
76
- image.save(out_path, format="PNG")
77
 
78
- return out_path
79
 
80
- except Exception as e:
81
- print(f"Error processing page {page_num + 1}: {e}")
82
- return None
83
-
84
- def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
85
- """
86
- Convert pages of a PDF to images using multithreading.
87
- """
88
- # Get the number of pages in the PDF
89
- page_count = pdfinfo_from_path(pdf_path)['Pages']
90
- print(f"Number of pages in PDF: {page_count}")
91
 
92
- images = []
93
 
94
- # Use ThreadPoolExecutor to process pages in parallel
95
- with ThreadPoolExecutor(max_workers=num_threads) as executor:
96
- futures = []
97
- for page_num in range(page_min, page_count):
98
- futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
99
 
100
- # Display progress using tqdm
101
- for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
102
- result = future.result()
103
- if result:
104
- images.append(result)
105
- else:
106
- print("A page failed to process.")
107
 
108
- print("PDF has been converted to images.")
109
- return images
110
-
111
- # Example usage
112
- if __name__ == "__main__":
113
- pdf_path = "example.pdf"
114
- image_dpi = 200
115
- output_images = convert_pdf_to_images(pdf_path, image_dpi=image_dpi, num_threads=8)
116
- print("Images saved:", output_images)
117
-
118
 
119
- # def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
120
 
121
- # print("pdf_path in convert_pdf_to_images:", pdf_path)
122
 
123
- # # Get the number of pages in the PDF
124
- # page_count = pdfinfo_from_path(pdf_path)['Pages']
125
- # print("Number of pages in PDF: ", str(page_count))
126
 
127
- # images = []
128
 
129
- # # Open the PDF file
130
- # #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
131
- # for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
132
 
133
- # #print("page_num in convert_pdf_to_images:", page_num)
134
 
135
- # print("Converting page: ", str(page_num + 1))
136
 
137
- # # Convert one page to image
138
- # out_path = pdf_path + "_" + str(page_num) + ".png"
139
 
140
- # # Ensure the directory exists
141
- # os.makedirs(os.path.dirname(out_path), exist_ok=True)
142
 
143
- # # Check if the image already exists
144
- # if os.path.exists(out_path):
145
- # #print(f"Loading existing image from {out_path}.")
146
- # image = Image.open(out_path) # Load the existing image
147
 
148
- # else:
149
- # image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
150
 
151
- # image = image_l[0]
152
 
153
- # # Convert to greyscale
154
- # image = image.convert("L")
155
 
156
- # image.save(out_path, format="PNG") # Save the new image
157
 
158
- # # If no images are returned, break the loop
159
- # if not image:
160
- # print("Conversion of page", str(page_num), "to file failed.")
161
- # break
162
 
163
- # # print("Conversion of page", str(page_num), "to file succeeded.")
164
- # # print("image:", image)
165
 
166
- # images.append(out_path)
167
 
168
- # print("PDF has been converted to images.")
169
- # # print("Images:", images)
170
 
171
- # return images
172
 
173
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
174
  def process_file(file_path:str):
 
48
 
49
 
50
 
51
+ # def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
52
+ # """
53
+ # Convert a single page of a PDF to an image and save it as a PNG.
54
+ # Returns the path to the saved image.
55
+ # """
56
+ # try:
57
+ # out_path = f"{pdf_path}_{page_num}.png"
58
 
59
+ # # Ensure the directory exists
60
+ # os.makedirs(os.path.dirname(out_path), exist_ok=True)
61
 
62
+ # # Check if the image already exists
63
+ # if os.path.exists(out_path):
64
+ # # Load the existing image
65
+ # print(f"Loading existing image for page {page_num + 1}")
66
+ # image = Image.open(out_path)
67
+ # else:
68
+ # # Convert the page to an image
69
+ # print(f"Converting page {page_num + 1}")
70
+ # image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
71
+ # dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
72
+ # image = image_l[0]
73
 
74
+ # # Convert to greyscale
75
+ # image = image.convert("L")
76
+ # image.save(out_path, format="PNG")
77
 
78
+ # return out_path
79
 
80
+ # except Exception as e:
81
+ # print(f"Error processing page {page_num + 1}: {e}")
82
+ # return None
83
+
84
+ # def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
85
+ # """
86
+ # Convert pages of a PDF to images using multithreading.
87
+ # """
88
+ # # Get the number of pages in the PDF
89
+ # page_count = pdfinfo_from_path(pdf_path)['Pages']
90
+ # print(f"Number of pages in PDF: {page_count}")
91
 
92
+ # images = []
93
 
94
+ # # Use ThreadPoolExecutor to process pages in parallel
95
+ # with ThreadPoolExecutor(max_workers=num_threads) as executor:
96
+ # futures = []
97
+ # for page_num in range(page_min, page_count):
98
+ # futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
99
 
100
+ # # Display progress using tqdm
101
+ # for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
102
+ # result = future.result()
103
+ # if result:
104
+ # images.append(result)
105
+ # else:
106
+ # print("A page failed to process.")
107
 
108
+ # print("PDF has been converted to images.")
109
+ # return images
 
 
 
 
 
 
 
 
110
 
111
+ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
112
 
113
+ print("pdf_path in convert_pdf_to_images:", pdf_path)
114
 
115
+ # Get the number of pages in the PDF
116
+ page_count = pdfinfo_from_path(pdf_path)['Pages']
117
+ print("Number of pages in PDF: ", str(page_count))
118
 
119
+ images = []
120
 
121
+ # Open the PDF file
122
+ #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
123
+ for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
124
 
125
+ #print("page_num in convert_pdf_to_images:", page_num)
126
 
127
+ print("Converting page: ", str(page_num + 1))
128
 
129
+ # Convert one page to image
130
+ out_path = pdf_path + "_" + str(page_num) + ".png"
131
 
132
+ # Ensure the directory exists
133
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
134
 
135
+ # Check if the image already exists
136
+ if os.path.exists(out_path):
137
+ #print(f"Loading existing image from {out_path}.")
138
+ image = Image.open(out_path) # Load the existing image
139
 
140
+ else:
141
+ image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
142
 
143
+ image = image_l[0]
144
 
145
+ # Convert to greyscale
146
+ image = image.convert("L")
147
 
148
+ image.save(out_path, format="PNG") # Save the new image
149
 
150
+ # If no images are returned, break the loop
151
+ if not image:
152
+ print("Conversion of page", str(page_num), "to file failed.")
153
+ break
154
 
155
+ # print("Conversion of page", str(page_num), "to file succeeded.")
156
+ # print("image:", image)
157
 
158
+ images.append(out_path)
159
 
160
+ print("PDF has been converted to images.")
161
+ # print("Images:", images)
162
 
163
+ return images
164
 
165
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
166
  def process_file(file_path:str):
tools/file_redaction.py CHANGED
@@ -26,14 +26,14 @@ from presidio_analyzer import RecognizerResult
26
  from tools.aws_functions import RUN_AWS_FUNCTIONS
27
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
28
  from tools.file_conversion import process_file, image_dpi
29
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
30
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
31
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
32
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
33
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
34
 
35
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
36
- page_break_value = get_or_create_env_var('page_break_value', '500')
37
  print(f'The value of page_break_value is {page_break_value}')
38
 
39
  max_time_value = get_or_create_env_var('max_time_value', '999999')
@@ -526,14 +526,14 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
526
  scale_width = image_page_width / mediabox_width
527
  scale_height = image_page_height / mediabox_height
528
 
529
- print("scale_width:", scale_width)
530
- print("scale_height:", scale_height)
531
 
532
  rect_to_mediabox_x_scale = mediabox_width / rect_width
533
  rect_to_mediabox_y_scale = mediabox_height / rect_height
534
 
535
- print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
536
- print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
537
 
538
  # Adjust coordinates based on scaling factors
539
  x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
@@ -815,8 +815,10 @@ def redact_image_pdf(file_path:str,
815
  pymupdf_doc = [],
816
  pii_identification_method:str="Local",
817
  comprehend_query_number:int=0,
818
- comprehend_client="",
819
- textract_client="",
 
 
820
  page_break_val:int=int(page_break_value),
821
  logging_file_paths:List=[],
822
  max_time:int=int(max_time_value),
@@ -847,6 +849,8 @@ def redact_image_pdf(file_path:str,
847
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
848
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
849
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
 
 
850
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
851
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
852
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
@@ -855,10 +859,19 @@ def redact_image_pdf(file_path:str,
855
  The function returns a fully or partially-redacted PDF document.
856
  '''
857
  file_name = get_file_path_end(file_path)
858
- fill = (0, 0, 0) # Fill colour
859
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
860
  comprehend_query_number_new = 0
861
 
 
 
 
 
 
 
 
 
 
 
862
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
863
  print("Connection to AWS Comprehend service unsuccessful.")
864
 
@@ -913,8 +926,7 @@ def redact_image_pdf(file_path:str,
913
  image = prepared_pdf_file_paths[page_no]#.copy()
914
  #print("image:", image)
915
  except Exception as e:
916
- print("Could not redact page:", reported_page_number, "due to:")
917
- print(e)
918
  continue
919
 
920
  image_annotations = {"image": image, "boxes": []}
@@ -975,7 +987,7 @@ def redact_image_pdf(file_path:str,
975
 
976
  if not page_exists: # If the page does not exist, analyze again
977
  print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
978
- text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, handwrite_signature_checkbox) # Analyse page with Textract
979
 
980
  # Check if "pages" key exists, if not, initialize it as an empty list
981
  if "pages" not in existing_data:
@@ -1405,6 +1417,8 @@ def redact_text_pdf(
1405
  pii_identification_method: str = "Local",
1406
  comprehend_query_number:int = 0,
1407
  comprehend_client="",
 
 
1408
  page_break_val: int = int(page_break_value), # Value for page break
1409
  max_time: int = int(max_time_value),
1410
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1431,7 +1445,9 @@ def redact_text_pdf(
1431
  - pymupdf_doc: List of PyMuPDF documents
1432
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
1433
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1434
- - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
 
 
1435
  - page_break_val: Value for page break
1436
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1437
  - progress: Progress tracking object
@@ -1441,6 +1457,12 @@ def redact_text_pdf(
1441
  print("Connection to AWS Comprehend service not found.")
1442
 
1443
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 
 
 
 
 
 
1444
 
1445
  tic = time.perf_counter()
1446
 
 
26
  from tools.aws_functions import RUN_AWS_FUNCTIONS
27
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
28
  from tools.file_conversion import process_file, image_dpi
29
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
30
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
31
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
32
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
33
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
34
 
35
  # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
36
+ page_break_value = get_or_create_env_var('page_break_value', '50000')
37
  print(f'The value of page_break_value is {page_break_value}')
38
 
39
  max_time_value = get_or_create_env_var('max_time_value', '999999')
 
526
  scale_width = image_page_width / mediabox_width
527
  scale_height = image_page_height / mediabox_height
528
 
529
+ #print("scale_width:", scale_width)
530
+ #print("scale_height:", scale_height)
531
 
532
  rect_to_mediabox_x_scale = mediabox_width / rect_width
533
  rect_to_mediabox_y_scale = mediabox_height / rect_height
534
 
535
+ #print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
536
+ #print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
537
 
538
  # Adjust coordinates based on scaling factors
539
  x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
 
815
  pymupdf_doc = [],
816
  pii_identification_method:str="Local",
817
  comprehend_query_number:int=0,
818
+ comprehend_client:str="",
819
+ textract_client:str="",
820
+ custom_recogniser_word_list:List[str]=[],
821
+ redact_whole_page_list:List[str]=[],
822
  page_break_val:int=int(page_break_value),
823
  logging_file_paths:List=[],
824
  max_time:int=int(max_time_value),
 
849
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
850
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
851
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
852
+ - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
853
+ - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
854
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
855
  - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
856
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
859
  The function returns a fully or partially-redacted PDF document.
860
  '''
861
  file_name = get_file_path_end(file_path)
862
+ fill = (0, 0, 0) # Fill colour for redactions
 
863
  comprehend_query_number_new = 0
864
 
865
+ # Update custom word list analyser object with any new words that have been added to the custom deny list
866
+ if custom_recogniser_word_list:
867
+ nlp_analyser.registry.remove_recognizer("CUSTOM")
868
+ new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
869
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
870
+
871
+
872
+ image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
873
+
874
+
875
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
876
  print("Connection to AWS Comprehend service unsuccessful.")
877
 
 
926
  image = prepared_pdf_file_paths[page_no]#.copy()
927
  #print("image:", image)
928
  except Exception as e:
929
+ print("Could not redact page:", reported_page_number, "due to:", e)
 
930
  continue
931
 
932
  image_annotations = {"image": image, "boxes": []}
 
987
 
988
  if not page_exists: # If the page does not exist, analyze again
989
  print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
990
+ text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
991
 
992
  # Check if "pages" key exists, if not, initialize it as an empty list
993
  if "pages" not in existing_data:
 
1417
  pii_identification_method: str = "Local",
1418
  comprehend_query_number:int = 0,
1419
  comprehend_client="",
1420
+ custom_recogniser_word_list:List[str]=[],
1421
+ redact_whole_page_list:List[str]=[],
1422
  page_break_val: int = int(page_break_value), # Value for page break
1423
  max_time: int = int(max_time_value),
1424
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
1445
  - pymupdf_doc: List of PyMuPDF documents
1446
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
1447
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
1448
+ - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1449
+ - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
1450
+ - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1451
  - page_break_val: Value for page break
1452
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1453
  - progress: Progress tracking object
 
1457
  print("Connection to AWS Comprehend service not found.")
1458
 
1459
  return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
1460
+
1461
+ # Update custom word list analyser object with any new words that have been added to the custom deny list
1462
+ if custom_recogniser_word_list:
1463
+ nlp_analyser.registry.remove_recognizer("CUSTOM")
1464
+ new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1465
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1466
 
1467
  tic = time.perf_counter()
1468
 
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -10,17 +10,37 @@ import re
10
  # %%
11
  model_name = "en_core_web_lg" #"en_core_web_trf"
12
  score_threshold = 0.001
13
- custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
 
 
 
 
 
 
 
 
 
 
 
14
 
15
- # %% [markdown]
16
  # #### Custom recognisers
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- # %%
19
  # Custom title recogniser
20
  titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
21
  titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
22
  titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
23
- titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern],
24
  global_regex_flags=re.DOTALL | re.MULTILINE)
25
 
26
  # %%
@@ -34,7 +54,7 @@ ukpostcode_pattern = Pattern(
34
  )
35
 
36
  # Define the recognizer with one or more patterns
37
- ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
38
 
39
  # %%
40
  # Examples for testing
@@ -134,49 +154,27 @@ class StreetNameRecognizer(EntityRecognizer):
134
 
135
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
136
 
137
- # %%
138
  # Create a class inheriting from SpacyNlpEngine
139
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
140
  def __init__(self, loaded_spacy_model):
141
  super().__init__()
142
  self.nlp = {"en": loaded_spacy_model}
143
 
144
- # %%
145
- #Load spacy model
146
- try:
147
- import en_core_web_lg
148
- nlp = en_core_web_lg.load()
149
- print("Successfully imported spaCy model")
150
 
151
- except:
152
- download("en_core_web_lg")
153
- nlp = spacy.load("en_core_web_lg")
154
- print("Successfully downloaded and imported spaCy model")
155
-
156
- # try:
157
- # import en_core_web_sm
158
- # nlp = en_core_web_sm.load()
159
- # print("Successfully imported spaCy model")
160
-
161
- # except:
162
- # download("en_core_web_sm")
163
- # nlp = spacy.load("en_core_web_sm")
164
- # print("Successfully downloaded and imported spaCy model")
165
 
166
  # Pass the loaded model to the new LoadedSpacyNlpEngine
167
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
168
 
169
 
170
-
171
- # %%
172
  nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
173
  default_score_threshold=score_threshold,
174
  supported_languages=["en"],
175
  log_decision_process=False,
176
  )
177
 
178
- # %%
179
  nlp_analyser.registry.add_recognizer(street_recogniser)
180
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
181
  nlp_analyser.registry.add_recognizer(titles_recogniser)
 
182
 
 
10
  # %%
11
  model_name = "en_core_web_lg" #"en_core_web_trf"
12
  score_threshold = 0.001
13
+ custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
14
+
15
+ #Load spacy model
16
+ try:
17
+ import en_core_web_lg
18
+ nlp = en_core_web_lg.load()
19
+ print("Successfully imported spaCy model")
20
+
21
+ except:
22
+ download(model_name)
23
+ nlp = spacy.load(model_name)
24
+ print("Successfully downloaded and imported spaCy model")
25
 
 
26
  # #### Custom recognisers
27
+ # Allow user to create their own recogniser
28
+ def custom_word_list_recogniser(custom_list:List[str]=[]):
29
+ custom_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(term)}" for term in custom_list) + '\\b'
30
+ custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
31
+ custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
32
+ global_regex_flags=re.DOTALL | re.MULTILINE)
33
+
34
+ return custom_recogniser
35
+
36
+ # Initialise custom recogniser that will be overwritten later
37
+ custom_recogniser = custom_word_list_recogniser()
38
 
 
39
  # Custom title recogniser
40
  titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
41
  titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
42
  titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
43
+ titles_recogniser = PatternRecognizer(supported_entity="TITLES", name="TITLES", patterns = [titles_pattern],
44
  global_regex_flags=re.DOTALL | re.MULTILINE)
45
 
46
  # %%
 
54
  )
55
 
56
  # Define the recognizer with one or more patterns
57
+ ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
58
 
59
  # %%
60
  # Examples for testing
 
154
 
155
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
156
 
 
157
  # Create a class inheriting from SpacyNlpEngine
158
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
159
  def __init__(self, loaded_spacy_model):
160
  super().__init__()
161
  self.nlp = {"en": loaded_spacy_model}
162
 
 
 
 
 
 
 
163
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
  # Pass the loaded model to the new LoadedSpacyNlpEngine
166
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
167
 
168
 
 
 
169
  nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
170
  default_score_threshold=score_threshold,
171
  supported_languages=["en"],
172
  log_decision_process=False,
173
  )
174
 
175
+ # Add custom recognisers to nlp_analyser
176
  nlp_analyser.registry.add_recognizer(street_recogniser)
177
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
178
  nlp_analyser.registry.add_recognizer(titles_recogniser)
179
+ nlp_analyser.registry.add_recognizer(custom_recogniser)
180