Commit
·
e3365ed
1
Parent(s):
9504619
Started adding support for a custom deny list. Fixed Textract call issue. Removed multithreading for now, as it mixes up the page order.
Browse files
- app.py +1 -1
- tools/aws_textract.py +19 -6
- tools/custom_image_analyser_engine.py +1 -1
- tools/file_conversion.py +85 -93
- tools/file_redaction.py +36 -14
- tools/load_spacy_model_custom_recognisers.py +27 -29
app.py
CHANGED
@@ -36,7 +36,7 @@ full_comprehend_entity_list.extend(custom_entities)
|
|
36 |
|
37 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
|
38 |
|
39 |
-
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
|
40 |
|
41 |
language = 'en'
|
42 |
|
|
|
36 |
|
37 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
|
38 |
|
39 |
+
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
|
40 |
|
41 |
language = 'en'
|
42 |
|
tools/aws_textract.py
CHANGED
@@ -4,6 +4,7 @@ from typing import List
|
|
4 |
import io
|
5 |
#import json
|
6 |
import pikepdf
|
|
|
7 |
# Example: converting this single page to an image
|
8 |
#from pdf2image import convert_from_bytes
|
9 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
@@ -11,7 +12,7 @@ from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerR
|
|
11 |
def extract_textract_metadata(response):
|
12 |
"""Extracts metadata from an AWS Textract response."""
|
13 |
|
14 |
-
print("Document metadata:", response['DocumentMetadata'])
|
15 |
|
16 |
request_id = response['ResponseMetadata']['RequestId']
|
17 |
pages = response['DocumentMetadata']['Pages']
|
@@ -35,16 +36,28 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
|
|
35 |
print("Cannot connect to AWS Textract")
|
36 |
return [], "" # Return an empty list and an empty string
|
37 |
|
38 |
-
print("Analysing page with AWS Textract")
|
|
|
|
|
39 |
|
40 |
# Redact signatures if specified
|
41 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
42 |
-
print("Analysing document with signature detection")
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
44 |
else:
|
45 |
-
print("Analysing document without signature detection")
|
46 |
# Call detect_document_text to extract plain text
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
# Wrap the response with the page number in the desired format
|
50 |
wrapped_response = {
|
|
|
4 |
import io
|
5 |
#import json
|
6 |
import pikepdf
|
7 |
+
import time
|
8 |
# Example: converting this single page to an image
|
9 |
#from pdf2image import convert_from_bytes
|
10 |
from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
|
|
|
12 |
def extract_textract_metadata(response):
|
13 |
"""Extracts metadata from an AWS Textract response."""
|
14 |
|
15 |
+
#print("Document metadata:", response['DocumentMetadata'])
|
16 |
|
17 |
request_id = response['ResponseMetadata']['RequestId']
|
18 |
pages = response['DocumentMetadata']['Pages']
|
|
|
36 |
print("Cannot connect to AWS Textract")
|
37 |
return [], "" # Return an empty list and an empty string
|
38 |
|
39 |
+
#print("Analysing page with AWS Textract")
|
40 |
+
#print("pdf_page_bytes:", pdf_page_bytes)
|
41 |
+
#print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
|
42 |
|
43 |
# Redact signatures if specified
|
44 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
45 |
+
#print("Analysing document with signature detection")
|
46 |
+
try:
|
47 |
+
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
48 |
+
except Exception as e:
|
49 |
+
print("Textract call failed due to:", e, "trying again in 5 seconds.")
|
50 |
+
time.sleep(5)
|
51 |
+
response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
|
52 |
else:
|
53 |
+
#print("Analysing document without signature detection")
|
54 |
# Call detect_document_text to extract plain text
|
55 |
+
try:
|
56 |
+
response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
|
57 |
+
except Exception as e:
|
58 |
+
print("Textract call failed due to:", e, "trying again in 5 seconds.")
|
59 |
+
time.sleep(5)
|
60 |
+
response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
|
61 |
|
62 |
# Wrap the response with the page number in the desired format
|
63 |
wrapped_response = {
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -520,7 +520,7 @@ class CustomImageAnalyzerEngine:
|
|
520 |
)
|
521 |
|
522 |
except Exception as e:
|
523 |
-
print(e)
|
524 |
time.sleep(3)
|
525 |
response = comprehend_client.detect_pii_entities(
|
526 |
Text=current_batch,
|
|
|
520 |
)
|
521 |
|
522 |
except Exception as e:
|
523 |
+
print("AWS Comprehend call failed due to:", e, "waiting three seconds to try again.")
|
524 |
time.sleep(3)
|
525 |
response = comprehend_client.detect_pii_entities(
|
526 |
Text=current_batch,
|
tools/file_conversion.py
CHANGED
@@ -48,127 +48,119 @@ def is_pdf(filename):
|
|
48 |
|
49 |
|
50 |
|
51 |
-
def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
|
59 |
-
|
60 |
-
|
61 |
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
|
78 |
-
|
79 |
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
|
92 |
-
|
93 |
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
# Example usage
|
112 |
-
if __name__ == "__main__":
|
113 |
-
pdf_path = "example.pdf"
|
114 |
-
image_dpi = 200
|
115 |
-
output_images = convert_pdf_to_images(pdf_path, image_dpi=image_dpi, num_threads=8)
|
116 |
-
print("Images saved:", output_images)
|
117 |
-
|
118 |
|
119 |
-
|
120 |
|
121 |
-
|
122 |
|
123 |
-
#
|
124 |
-
|
125 |
-
|
126 |
|
127 |
-
|
128 |
|
129 |
-
#
|
130 |
-
#
|
131 |
-
|
132 |
|
133 |
-
#
|
134 |
|
135 |
-
|
136 |
|
137 |
-
#
|
138 |
-
|
139 |
|
140 |
-
#
|
141 |
-
|
142 |
|
143 |
-
#
|
144 |
-
|
145 |
-
#
|
146 |
-
|
147 |
|
148 |
-
|
149 |
-
|
150 |
|
151 |
-
|
152 |
|
153 |
-
#
|
154 |
-
|
155 |
|
156 |
-
|
157 |
|
158 |
-
#
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
|
163 |
-
#
|
164 |
-
#
|
165 |
|
166 |
-
|
167 |
|
168 |
-
|
169 |
-
#
|
170 |
|
171 |
-
|
172 |
|
173 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
174 |
def process_file(file_path:str):
|
|
|
48 |
|
49 |
|
50 |
|
51 |
+
# def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> str:
|
52 |
+
# """
|
53 |
+
# Convert a single page of a PDF to an image and save it as a PNG.
|
54 |
+
# Returns the path to the saved image.
|
55 |
+
# """
|
56 |
+
# try:
|
57 |
+
# out_path = f"{pdf_path}_{page_num}.png"
|
58 |
|
59 |
+
# # Ensure the directory exists
|
60 |
+
# os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
61 |
|
62 |
+
# # Check if the image already exists
|
63 |
+
# if os.path.exists(out_path):
|
64 |
+
# # Load the existing image
|
65 |
+
# print(f"Loading existing image for page {page_num + 1}")
|
66 |
+
# image = Image.open(out_path)
|
67 |
+
# else:
|
68 |
+
# # Convert the page to an image
|
69 |
+
# print(f"Converting page {page_num + 1}")
|
70 |
+
# image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
71 |
+
# dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
72 |
+
# image = image_l[0]
|
73 |
|
74 |
+
# # Convert to greyscale
|
75 |
+
# image = image.convert("L")
|
76 |
+
# image.save(out_path, format="PNG")
|
77 |
|
78 |
+
# return out_path
|
79 |
|
80 |
+
# except Exception as e:
|
81 |
+
# print(f"Error processing page {page_num + 1}: {e}")
|
82 |
+
# return None
|
83 |
+
|
84 |
+
# def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
|
85 |
+
# """
|
86 |
+
# Convert pages of a PDF to images using multithreading.
|
87 |
+
# """
|
88 |
+
# # Get the number of pages in the PDF
|
89 |
+
# page_count = pdfinfo_from_path(pdf_path)['Pages']
|
90 |
+
# print(f"Number of pages in PDF: {page_count}")
|
91 |
|
92 |
+
# images = []
|
93 |
|
94 |
+
# # Use ThreadPoolExecutor to process pages in parallel
|
95 |
+
# with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
96 |
+
# futures = []
|
97 |
+
# for page_num in range(page_min, page_count):
|
98 |
+
# futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
|
99 |
|
100 |
+
# # Display progress using tqdm
|
101 |
+
# for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
|
102 |
+
# result = future.result()
|
103 |
+
# if result:
|
104 |
+
# images.append(result)
|
105 |
+
# else:
|
106 |
+
# print("A page failed to process.")
|
107 |
|
108 |
+
# print("PDF has been converted to images.")
|
109 |
+
# return images
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
+
def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
    """
    Convert pages of a PDF to greyscale PNG images, one page at a time and in page order.

    Each page is written to "<pdf_path>_<zero-based page index>.png". A page whose
    PNG already exists on disk is not rendered again; its path is reused.

    Args:
        pdf_path: Path to the PDF file.
        page_min: Zero-based index of the first page to convert.
        image_dpi: Resolution passed to convert_from_path.
        progress: Not used in the body; kept so callers that pass it still work.

    Returns:
        List of PNG file paths in page order. If a page cannot be rendered, the
        loop stops and the paths collected up to that point are returned.
    """
    print("pdf_path in convert_pdf_to_images:", pdf_path)

    # Get the number of pages in the PDF
    page_count = pdfinfo_from_path(pdf_path)['Pages']
    print("Number of pages in PDF: ", str(page_count))

    # Every output image shares the PDF's directory, so create it once up front.
    # os.makedirs("") raises, so skip it when pdf_path is a bare filename.
    out_dir = os.path.dirname(pdf_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    images = []

    # No explicit total: tqdm takes the length from the range, which stays
    # correct when page_min > 0 (total=page_count left the bar unfinished).
    for page_num in tqdm(range(page_min, page_count), unit="pages", desc="Preparing pages"):

        print("Converting page: ", str(page_num + 1))

        out_path = pdf_path + "_" + str(page_num) + ".png"

        # Only the path is returned, so an image that already exists does not
        # need to be opened (opening it left a file handle unclosed).
        if not os.path.exists(out_path):
            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)

            # An empty result means the page was not rendered; test the list
            # itself, because indexing it first would raise IndexError.
            if not image_l:
                print("Conversion of page", str(page_num), "to file failed.")
                break

            # Convert to greyscale, then save the new image
            image_l[0].convert("L").save(out_path, format="PNG")

        images.append(out_path)

    print("PDF has been converted to images.")

    return images
|
164 |
|
165 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
166 |
def process_file(file_path:str):
|
tools/file_redaction.py
CHANGED
@@ -26,14 +26,14 @@ from presidio_analyzer import RecognizerResult
|
|
26 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
27 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
28 |
from tools.file_conversion import process_file, image_dpi
|
29 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
|
30 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
31 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
32 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
33 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
34 |
|
35 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
36 |
-
page_break_value = get_or_create_env_var('page_break_value', '
|
37 |
print(f'The value of page_break_value is {page_break_value}')
|
38 |
|
39 |
max_time_value = get_or_create_env_var('max_time_value', '999999')
|
@@ -526,14 +526,14 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
|
526 |
scale_width = image_page_width / mediabox_width
|
527 |
scale_height = image_page_height / mediabox_height
|
528 |
|
529 |
-
print("scale_width:", scale_width)
|
530 |
-
print("scale_height:", scale_height)
|
531 |
|
532 |
rect_to_mediabox_x_scale = mediabox_width / rect_width
|
533 |
rect_to_mediabox_y_scale = mediabox_height / rect_height
|
534 |
|
535 |
-
print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
|
536 |
-
print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
|
537 |
|
538 |
# Adjust coordinates based on scaling factors
|
539 |
x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
|
@@ -815,8 +815,10 @@ def redact_image_pdf(file_path:str,
|
|
815 |
pymupdf_doc = [],
|
816 |
pii_identification_method:str="Local",
|
817 |
comprehend_query_number:int=0,
|
818 |
-
comprehend_client="",
|
819 |
-
textract_client="",
|
|
|
|
|
820 |
page_break_val:int=int(page_break_value),
|
821 |
logging_file_paths:List=[],
|
822 |
max_time:int=int(max_time_value),
|
@@ -847,6 +849,8 @@ def redact_image_pdf(file_path:str,
|
|
847 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
848 |
- comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
|
849 |
- textract_client (optional): A connection to the AWS Textract service via the boto3 package.
|
|
|
|
|
850 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
851 |
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
852 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
@@ -855,10 +859,19 @@ def redact_image_pdf(file_path:str,
|
|
855 |
The function returns a fully or partially-redacted PDF document.
|
856 |
'''
|
857 |
file_name = get_file_path_end(file_path)
|
858 |
-
fill = (0, 0, 0) # Fill colour
|
859 |
-
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
860 |
comprehend_query_number_new = 0
|
861 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
862 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
863 |
print("Connection to AWS Comprehend service unsuccessful.")
|
864 |
|
@@ -913,8 +926,7 @@ def redact_image_pdf(file_path:str,
|
|
913 |
image = prepared_pdf_file_paths[page_no]#.copy()
|
914 |
#print("image:", image)
|
915 |
except Exception as e:
|
916 |
-
print("Could not redact page:", reported_page_number, "due to:")
|
917 |
-
print(e)
|
918 |
continue
|
919 |
|
920 |
image_annotations = {"image": image, "boxes": []}
|
@@ -975,7 +987,7 @@ def redact_image_pdf(file_path:str,
|
|
975 |
|
976 |
if not page_exists: # If the page does not exist, analyze again
|
977 |
print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
|
978 |
-
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, handwrite_signature_checkbox) # Analyse page with Textract
|
979 |
|
980 |
# Check if "pages" key exists, if not, initialize it as an empty list
|
981 |
if "pages" not in existing_data:
|
@@ -1405,6 +1417,8 @@ def redact_text_pdf(
|
|
1405 |
pii_identification_method: str = "Local",
|
1406 |
comprehend_query_number:int = 0,
|
1407 |
comprehend_client="",
|
|
|
|
|
1408 |
page_break_val: int = int(page_break_value), # Value for page break
|
1409 |
max_time: int = int(max_time_value),
|
1410 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
@@ -1431,7 +1445,9 @@ def redact_text_pdf(
|
|
1431 |
- pymupdf_doc: List of PyMuPDF documents
|
1432 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
1433 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
1434 |
-
- comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
|
|
|
|
|
1435 |
- page_break_val: Value for page break
|
1436 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1437 |
- progress: Progress tracking object
|
@@ -1441,6 +1457,12 @@ def redact_text_pdf(
|
|
1441 |
print("Connection to AWS Comprehend service not found.")
|
1442 |
|
1443 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
|
|
|
|
|
|
|
|
|
|
|
|
1444 |
|
1445 |
tic = time.perf_counter()
|
1446 |
|
|
|
26 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
27 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
28 |
from tools.file_conversion import process_file, image_dpi
|
29 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
30 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
31 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
32 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
33 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
34 |
|
35 |
# Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
|
36 |
+
page_break_value = get_or_create_env_var('page_break_value', '50000')
|
37 |
print(f'The value of page_break_value is {page_break_value}')
|
38 |
|
39 |
max_time_value = get_or_create_env_var('max_time_value', '999999')
|
|
|
526 |
scale_width = image_page_width / mediabox_width
|
527 |
scale_height = image_page_height / mediabox_height
|
528 |
|
529 |
+
#print("scale_width:", scale_width)
|
530 |
+
#print("scale_height:", scale_height)
|
531 |
|
532 |
rect_to_mediabox_x_scale = mediabox_width / rect_width
|
533 |
rect_to_mediabox_y_scale = mediabox_height / rect_height
|
534 |
|
535 |
+
#print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
|
536 |
+
#print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
|
537 |
|
538 |
# Adjust coordinates based on scaling factors
|
539 |
x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
|
|
|
815 |
pymupdf_doc = [],
|
816 |
pii_identification_method:str="Local",
|
817 |
comprehend_query_number:int=0,
|
818 |
+
comprehend_client:str="",
|
819 |
+
textract_client:str="",
|
820 |
+
custom_recogniser_word_list:List[str]=[],
|
821 |
+
redact_whole_page_list:List[str]=[],
|
822 |
page_break_val:int=int(page_break_value),
|
823 |
logging_file_paths:List=[],
|
824 |
max_time:int=int(max_time_value),
|
|
|
849 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
850 |
- comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
|
851 |
- textract_client (optional): A connection to the AWS Textract service via the boto3 package.
|
852 |
+
- custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
|
853 |
+
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
854 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
855 |
- logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
|
856 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
|
|
859 |
The function returns a fully or partially-redacted PDF document.
|
860 |
'''
|
861 |
file_name = get_file_path_end(file_path)
|
862 |
+
fill = (0, 0, 0) # Fill colour for redactions
|
|
|
863 |
comprehend_query_number_new = 0
|
864 |
|
865 |
+
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
866 |
+
if custom_recogniser_word_list:
|
867 |
+
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
868 |
+
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
869 |
+
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
870 |
+
|
871 |
+
|
872 |
+
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
873 |
+
|
874 |
+
|
875 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
876 |
print("Connection to AWS Comprehend service unsuccessful.")
|
877 |
|
|
|
926 |
image = prepared_pdf_file_paths[page_no]#.copy()
|
927 |
#print("image:", image)
|
928 |
except Exception as e:
|
929 |
+
print("Could not redact page:", reported_page_number, "due to:", e)
|
|
|
930 |
continue
|
931 |
|
932 |
image_annotations = {"image": image, "boxes": []}
|
|
|
987 |
|
988 |
if not page_exists: # If the page does not exist, analyze again
|
989 |
print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
|
990 |
+
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
991 |
|
992 |
# Check if "pages" key exists, if not, initialize it as an empty list
|
993 |
if "pages" not in existing_data:
|
|
|
1417 |
pii_identification_method: str = "Local",
|
1418 |
comprehend_query_number:int = 0,
|
1419 |
comprehend_client="",
|
1420 |
+
custom_recogniser_word_list:List[str]=[],
|
1421 |
+
redact_whole_page_list:List[str]=[],
|
1422 |
page_break_val: int = int(page_break_value), # Value for page break
|
1423 |
max_time: int = int(max_time_value),
|
1424 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
|
1445 |
- pymupdf_doc: List of PyMuPDF documents
|
1446 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
1447 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
1448 |
+
- comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
|
1449 |
+
- custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
|
1450 |
+
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
1451 |
- page_break_val: Value for page break
|
1452 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1453 |
- progress: Progress tracking object
|
|
|
1457 |
print("Connection to AWS Comprehend service not found.")
|
1458 |
|
1459 |
return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
1460 |
+
|
1461 |
+
# Update custom word list analyser object with any new words that have been added to the custom deny list
|
1462 |
+
if custom_recogniser_word_list:
|
1463 |
+
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
1464 |
+
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
1465 |
+
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
1466 |
|
1467 |
tic = time.perf_counter()
|
1468 |
|
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -10,17 +10,37 @@ import re
|
|
10 |
# %%
|
11 |
model_name = "en_core_web_lg" #"en_core_web_trf"
|
12 |
score_threshold = 0.001
|
13 |
-
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
-
# %% [markdown]
|
16 |
# #### Custom recognisers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
|
18 |
-
# %%
|
19 |
# Custom title recogniser
|
20 |
titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
|
21 |
titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
|
22 |
titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
|
23 |
-
titles_recogniser = PatternRecognizer(supported_entity="TITLES", patterns = [titles_pattern],
|
24 |
global_regex_flags=re.DOTALL | re.MULTILINE)
|
25 |
|
26 |
# %%
|
@@ -34,7 +54,7 @@ ukpostcode_pattern = Pattern(
|
|
34 |
)
|
35 |
|
36 |
# Define the recognizer with one or more patterns
|
37 |
-
ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", patterns = [ukpostcode_pattern])
|
38 |
|
39 |
# %%
|
40 |
# Examples for testing
|
@@ -134,49 +154,27 @@ class StreetNameRecognizer(EntityRecognizer):
|
|
134 |
|
135 |
street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
|
136 |
|
137 |
-
# %%
|
138 |
# Create a class inheriting from SpacyNlpEngine
|
139 |
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
140 |
def __init__(self, loaded_spacy_model):
|
141 |
super().__init__()
|
142 |
self.nlp = {"en": loaded_spacy_model}
|
143 |
|
144 |
-
# %%
|
145 |
-
#Load spacy model
|
146 |
-
try:
|
147 |
-
import en_core_web_lg
|
148 |
-
nlp = en_core_web_lg.load()
|
149 |
-
print("Successfully imported spaCy model")
|
150 |
|
151 |
-
except:
|
152 |
-
download("en_core_web_lg")
|
153 |
-
nlp = spacy.load("en_core_web_lg")
|
154 |
-
print("Successfully downloaded and imported spaCy model")
|
155 |
-
|
156 |
-
# try:
|
157 |
-
# import en_core_web_sm
|
158 |
-
# nlp = en_core_web_sm.load()
|
159 |
-
# print("Successfully imported spaCy model")
|
160 |
-
|
161 |
-
# except:
|
162 |
-
# download("en_core_web_sm")
|
163 |
-
# nlp = spacy.load("en_core_web_sm")
|
164 |
-
# print("Successfully downloaded and imported spaCy model")
|
165 |
|
166 |
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
167 |
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
168 |
|
169 |
|
170 |
-
|
171 |
-
# %%
|
172 |
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
|
173 |
default_score_threshold=score_threshold,
|
174 |
supported_languages=["en"],
|
175 |
log_decision_process=False,
|
176 |
)
|
177 |
|
178 |
-
#
|
179 |
nlp_analyser.registry.add_recognizer(street_recogniser)
|
180 |
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
181 |
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
|
|
182 |
|
|
|
10 |
# %%
|
11 |
model_name = "en_core_web_lg" #"en_core_web_trf"
|
12 |
score_threshold = 0.001
|
13 |
+
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
|
14 |
+
|
15 |
+
#Load spacy model
|
16 |
+
# Prefer the installed model package; if that fails for any ordinary reason,
# download the model named in model_name and load that instead.
try:
    import en_core_web_lg
    nlp = en_core_web_lg.load()
    print("Successfully imported spaCy model")

# Exception rather than a bare except, so that Ctrl-C and SystemExit are not
# treated as a missing model and answered with a download.
except Exception:
    download(model_name)
    nlp = spacy.load(model_name)
    print("Successfully downloaded and imported spaCy model")
|
25 |
|
|
|
26 |
# #### Custom recognisers
|
27 |
+
# Allow user to create their own recogniser
|
28 |
+
def custom_word_list_recogniser(custom_list:List[str]=[]):
    """
    Build a PatternRecognizer for the "CUSTOM" entity from a user-supplied deny list.

    Each term is regex-escaped, so it matches literally, and is wrapped in \\b
    word-boundary anchors. The terms are joined into one alternation.

    Args:
        custom_list: Terms to redact. The list is only read, never mutated.
            Empty or whitespace-only entries are ignored.

    Returns:
        A PatternRecognizer named "CUSTOM" holding a single pattern with score 1.
    """
    # A blank term would add an alternative made only of word boundaries, and a
    # whitespace-only term would match the gaps between words, so drop both.
    terms = [term for term in custom_list if term.strip()]

    if terms:
        custom_regex = '|'.join('\\b' + re.escape(term) + '\\b' for term in terms)
    else:
        # "(?!)" can never match. With no terms the old expression collapsed to
        # "\b\b", an empty match at every word boundary. A pattern is still
        # supplied so the placeholder recogniser can be built and registered.
        custom_regex = '(?!)'

    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)

    custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns=[custom_pattern],
        global_regex_flags=re.DOTALL | re.MULTILINE)

    return custom_recogniser
|
35 |
+
|
36 |
+
# Initialise custom recogniser that will be overwritten later
|
37 |
+
custom_recogniser = custom_word_list_recogniser()
|
38 |
|
|
|
39 |
# Custom title recogniser
|
40 |
titles_list = ["Sir", "Ma'am", "Madam", "Mr", "Mr.", "Mrs", "Mrs.", "Ms", "Ms.", "Miss", "Dr", "Dr.", "Professor"]
|
41 |
titles_regex = '\\b' + '\\b|\\b'.join(rf"{re.escape(title)}" for title in titles_list) + '\\b'
|
42 |
titles_pattern = Pattern(name="titles_pattern",regex=titles_regex, score = 1)
|
43 |
+
titles_recogniser = PatternRecognizer(supported_entity="TITLES", name="TITLES", patterns = [titles_pattern],
|
44 |
global_regex_flags=re.DOTALL | re.MULTILINE)
|
45 |
|
46 |
# %%
|
|
|
54 |
)
|
55 |
|
56 |
# Define the recognizer with one or more patterns
|
57 |
+
ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
|
58 |
|
59 |
# %%
|
60 |
# Examples for testing
|
|
|
154 |
|
155 |
street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
|
156 |
|
|
|
157 |
# Create a class inheriting from SpacyNlpEngine
|
158 |
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
159 |
def __init__(self, loaded_spacy_model):
|
160 |
super().__init__()
|
161 |
self.nlp = {"en": loaded_spacy_model}
|
162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
|
165 |
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
166 |
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
167 |
|
168 |
|
|
|
|
|
169 |
nlp_analyser = AnalyzerEngine(nlp_engine=loaded_nlp_engine,
|
170 |
default_score_threshold=score_threshold,
|
171 |
supported_languages=["en"],
|
172 |
log_decision_process=False,
|
173 |
)
|
174 |
|
175 |
+
# Add custom recognisers to nlp_analyser
|
176 |
nlp_analyser.registry.add_recognizer(street_recogniser)
|
177 |
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
178 |
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
179 |
+
nlp_analyser.registry.add_recognizer(custom_recogniser)
|
180 |
|