Commit f0c28d7
Parent(s): e3365ed
Updated packages. Reinstated multithreading for page loading, now with page order protected. Smaller spaCy model used for speed. Textract calls should now be faster.
Files changed:
- requirements.txt +6 -6
- tools/aws_textract.py +3 -3
- tools/file_conversion.py +110 -123
- tools/file_redaction.py +97 -36
- tools/helper_functions.py +8 -3
- tools/load_spacy_model_custom_recognisers.py +4 -4
requirements.txt CHANGED

@@ -7,12 +7,12 @@ presidio_anonymizer==2.2.355
 presidio-image-redactor==0.0.53
 pikepdf==8.15.1
 pandas==2.2.3
-spacy==3.
-en_core_web_lg @ https://github.com/explosion/spacy
-
-gradio==5.
-boto3==1.35.
-pyarrow==
+spacy==3.8.3
+#en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-3.8.0.tar.gz
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==5.9.0
+boto3==1.35.83
+pyarrow==18.1.0
 openpyxl==3.1.2
 Faker==22.2.0
 gradio_image_annotation==0.2.5
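The headline change here is swapping the large spaCy model for the much smaller pinned en_core_web_sm wheel, alongside version bumps for spaCy, Gradio, boto3 and pyarrow. As a quick sanity check that the new pins resolve after `pip install -r requirements.txt`, a minimal sketch (not repo code, just an illustration):

    # Quick check, not part of the repo: confirm the new pins and the small
    # spaCy model resolve after installing the updated requirements.
    import spacy, gradio, boto3, pyarrow

    print("spacy", spacy.__version__)        # expect 3.8.3
    print("gradio", gradio.__version__)      # expect 5.9.0
    print("boto3", boto3.__version__)        # expect 1.35.83
    print("pyarrow", pyarrow.__version__)    # expect 18.1.0
    print("en_core_web_sm", spacy.load("en_core_web_sm").meta["version"])  # expect 3.8.0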
tools/aws_textract.py CHANGED

@@ -46,8 +46,8 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
     try:
         response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
     except Exception as e:
-        print("Textract call failed due to:", e, "trying again in
-        time.sleep(
+        print("Textract call failed due to:", e, "trying again in 3 seconds.")
+        time.sleep(3)
         response = client.analyze_document(Document={'Bytes': pdf_page_bytes}, FeatureTypes=["SIGNATURES"])
     else:
         #print("Analysing document without signature detection")

@@ -185,7 +185,7 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):

             if recogniser_result not in handwriting:
                 handwriting.append(recogniser_result)
-                print("Handwriting found:", handwriting[-1])
+                #print("Handwriting found:", handwriting[-1])

     # If handwriting or signature, add to bounding box
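The change above hard-codes a single retry after a 3-second pause. For reference, a more defensive variant with exponential backoff might look like the sketch below. This is an illustration only, not code from the commit; the helper name is invented, and the only external call it relies on is the same boto3 Textract analyze_document call already used in the diff.

    import time

    def analyse_with_retry(client, pdf_page_bytes, max_attempts=3, base_delay=3):
        # Hypothetical helper (not in the repo): retry the Textract call with
        # exponential backoff instead of a single fixed 3-second pause.
        for attempt in range(max_attempts):
            try:
                return client.analyze_document(
                    Document={'Bytes': pdf_page_bytes},
                    FeatureTypes=["SIGNATURES"],
                )
            except Exception as e:
                if attempt == max_attempts - 1:
                    raise
                delay = base_delay * (2 ** attempt)
                print("Textract call failed due to:", e, f"retrying in {delay} seconds.")
                time.sleep(delay)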
tools/file_conversion.py CHANGED

@@ -48,122 +48,112 @@ def is_pdf(filename):


-# print(f"Converting page {page_num + 1}")
-# image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
-#                             dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
-# image = image_l[0]
-# # Convert to greyscale
-# image = image.convert("L")
-# image.save(out_path, format="PNG")
-# return out_path
-# except Exception as e:
-#     print(f"Error processing page {page_num + 1}: {e}")
-#     return None
-# def convert_pdf_to_images(pdf_path: str, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
-#     """
-#     Convert pages of a PDF to images using multithreading.
-#     """
-#     # Get the number of pages in the PDF
-#     page_count = pdfinfo_from_path(pdf_path)['Pages']
-#     print(f"Number of pages in PDF: {page_count}")
-#     print("A page failed to process.")
-def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
+def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple[int, str]:
+    try:
+        out_path = f"{pdf_path}_{page_num}.png"
+        os.makedirs(os.path.dirname(out_path), exist_ok=True)
+        if os.path.exists(out_path):
+            print(f"Loading existing image for page {page_num + 1}")
+            image = Image.open(out_path)
+        else:
+            print(f"Converting page {page_num + 1}")
+            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
+                                        dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+            image = image_l[0]
+            image = image.convert("L")
+            image.save(out_path, format="PNG")
+        return page_num, out_path
+    except Exception as e:
+        print(f"Error processing page {page_num + 1}: {e}")
+        return page_num, None
+
+def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
+
+    # If preparing for review, just load the first page
+    if prepare_for_review == True:
+        page_count = pdfinfo_from_path(pdf_path)['Pages'] #1
+    else:
+        page_count = pdfinfo_from_path(pdf_path)['Pages']
+
+    print(f"Number of pages in PDF: {page_count}")
+
+    results = []
+    with ThreadPoolExecutor(max_workers=num_threads) as executor:
+        futures = []
+        for page_num in range(page_min, page_count):
+            futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
+
+        for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
+            page_num, result = future.result()
+            if result:
+                results.append((page_num, result))
+            else:
+                print(f"Page {page_num + 1} failed to process.")
+
+    # Sort results by page number
+    results.sort(key=lambda x: x[0])
+    images = [result[1] for result in results]
+
+    print("PDF has been converted to images.")
+    return images
+
+# def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
+#     print("pdf_path in convert_pdf_to_images:", pdf_path)
+#     # Get the number of pages in the PDF
+#     page_count = pdfinfo_from_path(pdf_path)['Pages']
+#     print("Number of pages in PDF: ", str(page_count))
+#     images = []
+#     # Open the PDF file
+#     #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
+#     for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
+#         #print("page_num in convert_pdf_to_images:", page_num)
+#         print("Converting page: ", str(page_num + 1))
+#         # Convert one page to image
+#         out_path = pdf_path + "_" + str(page_num) + ".png"
+#         # Ensure the directory exists
+#         os.makedirs(os.path.dirname(out_path), exist_ok=True)
+#         # Check if the image already exists
+#         if os.path.exists(out_path):
+#             #print(f"Loading existing image from {out_path}.")
+#             image = Image.open(out_path) # Load the existing image
+#         else:
+#             image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
+#             image = image_l[0]
+#             # Convert to greyscale
+#             image = image.convert("L")
+#             image.save(out_path, format="PNG") # Save the new image
+#         # If no images are returned, break the loop
+#         if not image:
+#             print("Conversion of page", str(page_num), "to file failed.")
+#             break
+#         # print("Conversion of page", str(page_num), "to file succeeded.")
+#         # print("image:", image)
+#         images.append(out_path)
+#     print("PDF has been converted to images.")
+#     # print("Images:", images)
+#     return images

 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
-def process_file(file_path:str):
+def process_file(file_path:str, prepare_for_review:bool=False):
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()

@@ -178,7 +168,7 @@ def process_file(file_path:str):
     elif file_extension == '.pdf':
         print(f"{file_path} is a PDF file. Converting to image set")
         # Run your function for processing PDF files here
-        img_object = convert_pdf_to_images(file_path)
+        img_object = convert_pdf_to_images(file_path, prepare_for_review)

     else:
         print(f"{file_path} is not an image or PDF file.")

@@ -195,7 +185,7 @@ def get_input_file_names(file_input):
     file_name_with_extension = ""
     full_file_name = ""

-    print("file_input in input file names:", file_input)
+    #print("file_input in input file names:", file_input)
     if isinstance(file_input, dict):
         file_input = os.path.abspath(file_input["name"])

@@ -222,8 +212,6 @@ def get_input_file_names(file_input):

     all_relevant_files_str = ", ".join(all_relevant_files)

-    print("all_relevant_files_str:", all_relevant_files_str)
-
     return all_relevant_files_str, file_name_with_extension, full_file_name

 def prepare_image_or_pdf(

@@ -253,6 +241,7 @@ def prepare_image_or_pdf(
         out_message (List[str]): List to store output messages.
         first_loop_state (bool): Flag indicating if this is the first iteration.
         number_of_pages (int): integer indicating the number of pages in the document
+        current_loop_page_number (int): Current number of loop
         all_annotations_object(List of annotation objects): All annotations for current document
         prepare_for_review(bool): Is this preparation step preparing pdfs and json files to review current redactions?
         progress (Progress): Progress tracker for the operation.

@@ -352,11 +341,11 @@ def prepare_image_or_pdf(
         if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option

         # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
         if file_path.endswith(".json"):

             if prepare_for_review == True:
+                print("Preparing file for review")
                 if isinstance(file_path, str):
                     with open(file_path, 'r') as json_file:
                         all_annotations_object = json.load(json_file)

@@ -372,11 +361,12 @@ def prepare_image_or_pdf(
                 ]
                 image_file_paths_pages = [int(i) for i in image_file_paths_pages]

-                # If PDF pages have been converted to image files, replace the current image paths in the json to this
+                # If PDF pages have been converted to image files, replace the current image paths in the json to this.
                 if image_file_paths:
+
                     for i, annotation in enumerate(all_annotations_object):
                         annotation_page_number = int(re.search(r'_(\d+)\.png$', annotation["image"]).group(1))
+                        #print("Annotation page number:", annotation_page_number)

                         # Check if the annotation page number exists in the image file paths pages
                         if annotation_page_number in image_file_paths_pages:

@@ -385,7 +375,7 @@ def prepare_image_or_pdf(
                             correct_image_page = annotation_page_number
                             annotation["image"] = image_file_paths[correct_image_page]
                         else:
-                            print("Page not found.")
+                            print("Page", annotation_page_number, "image file not found.")

                 #print("all_annotations_object:", all_annotations_object)

@@ -404,30 +394,24 @@ def prepare_image_or_pdf(
                     json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
                 continue

-            elif in_redact_method == text_ocr_option:
-                if is_pdf(file_path) == False:
-                    out_message = "Please upload a PDF file for text analysis."
-                    print(out_message)
-                    return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+            # Must be a pdf or image at this point
+            else:
+
+                # Convert pdf/image file to correct format for redaction
+                if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
+                    if is_pdf_or_image(file_path) == False:
+                        out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+                        print(out_message)
+                        return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+
+                elif in_redact_method == text_ocr_option:
+                    if is_pdf(file_path) == False:
+                        out_message = "Please upload a PDF file for text analysis."
+                        print(out_message)
+                        return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object

                 converted_file_path = file_path # Pikepdf works with the basic unconverted pdf file
-                image_file_path = process_file(file_path)
+                image_file_path = process_file(file_path, prepare_for_review)

                 converted_file_paths.append(converted_file_path)
                 image_file_paths.extend(image_file_path)

@@ -453,7 +437,10 @@ def prepare_image_or_pdf(
         out_message.append(out_time)
         out_message_out = '\n'.join(out_message)

+        if prepare_for_review == False:
             number_of_pages = len(image_file_paths)
+        else:
+            number_of_pages = len(all_annotations_object)


     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
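The order protection mentioned in the commit message comes from tagging each worker result with its page number and sorting before returning: as_completed yields futures in completion order, not submission order, so without the sort the page images could come back shuffled. A stripped-down illustration of the same pattern (not repo code; the worker function here is invented for the example):

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def fake_convert(page_num: int) -> tuple[int, str]:
        # Stand-in for process_single_page: return the page number with its "output path".
        return page_num, f"page_{page_num}.png"

    results = []
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(fake_convert, n) for n in range(10)]
        for future in as_completed(futures):      # completion order is arbitrary
            results.append(future.result())

    results.sort(key=lambda x: x[0])              # restore page order
    images = [path for _, path in results]
    print(images)  # ['page_0.png', 'page_1.png', ..., 'page_9.png']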
tools/file_redaction.py CHANGED

@@ -689,8 +689,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)

-    print("handwrite_signature_checkbox:", handwrite_signature_checkbox)
-
     # Process signature and handwriting results
     if signature_recogniser_results or handwriting_recogniser_results:
         if "Redact all identified handwriting" in handwrite_signature_checkbox:

@@ -906,6 +904,30 @@ def redact_image_pdf(file_path:str,
     if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
     elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"

+    # If running Textract, check if file already exists. If it does, load in existing data
+    # Import results from json and convert
+    if analysis_type == textract_option:
+
+        json_file_path = output_folder + file_name + "_textract.json"
+        logging_file_paths.append(json_file_path)
+
+        if not os.path.exists(json_file_path):
+            no_textract_file = True
+            print("No existing Textract results file found.")
+            existing_data = {}
+            #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+            #logging_file_paths.append(json_file_path)
+            #request_metadata = request_metadata + "\n" + new_request_metadata
+            #wrapped_text_blocks = {"pages":[text_blocks]}
+        else:
+            # Open the file and load the JSON data
+            no_textract_file = False
+            print("Found existing Textract json results file.")
+            with open(json_file_path, 'r') as json_file:
+                existing_data = json.load(json_file)
+
+    ###
+
     if current_loop_page == 0: page_loop_start = 0
     else: page_loop_start = current_loop_page

@@ -919,7 +941,7 @@ def redact_image_pdf(file_path:str,
         page_break_return = False

         reported_page_number = str(page_no + 1)
-        print("Redacting page:", reported_page_number)
+        #print("Redacting page:", reported_page_number)

         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
         try:

@@ -962,49 +984,72 @@ def redact_image_pdf(file_path:str,
                 image_buffer = io.BytesIO()
                 image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
                 pdf_page_as_bytes = image_buffer.getvalue()
-
-                json_file_path = output_folder + file_name + "_textract.json"
-
-                if not os.path.exists(json_file_path):
+
+                if not existing_data:
                     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
                     logging_file_paths.append(json_file_path)
                     request_metadata = request_metadata + "\n" + new_request_metadata

-                    # Write the updated existing_data back to the JSON file
-                    with open(json_file_path, 'w') as json_file:
-                        json.dump(wrapped_text_blocks, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                    existing_data = {"pages":[text_blocks]}

                 else:
-                    #
-                    with open(json_file_path, 'r') as json_file:
-                        existing_data = json.load(json_file)
-
-                    existing_data["pages"].append(text_blocks)
-
-                    # Write the updated existing_data back to the JSON file
-                    with open(json_file_path, 'w') as json_file:
-                        json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                    # Check if the current reported_page_number exists in the loaded JSON
+                    page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
+
+                    if not page_exists: # If the page does not exist, analyze again
+                        print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
+                        text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+
+                        # Check if "pages" key exists, if not, initialize it as an empty list
+                        if "pages" not in existing_data:
+                            existing_data["pages"] = []
+
+                        # Append the new page data
+                        existing_data["pages"].append(text_blocks)
+
+                        request_metadata = request_metadata + "\n" + new_request_metadata
+                    else:
+                        # If the page exists, retrieve the data
+                        text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
+
+                # if not os.path.exists(json_file_path):
+                #     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+                #     logging_file_paths.append(json_file_path)
+                #     request_metadata = request_metadata + "\n" + new_request_metadata
+                #     existing_data = {"pages":[text_blocks]}
+
+                # else:
+                #     # Open the file and load the JSON data
+                #     print("Found existing Textract json results file.")
+                #     with open(json_file_path, 'r') as json_file:
+                #         existing_data = json.load(json_file)
+
+                #     # Check if the current reported_page_number exists in the loaded JSON
+                #     page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
+
+                #     if not page_exists: # If the page does not exist, analyze again
+                #         print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
+                #         text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
+
+                #         # Check if "pages" key exists, if not, initialize it as an empty list
+                #         if "pages" not in existing_data:
+                #             existing_data["pages"] = []
+
+                #         # Append the new page data
+                #         existing_data["pages"].append(text_blocks)
+
+                #         # Write the updated existing_data back to the JSON file
+                #         with open(json_file_path, 'w') as json_file:
+                #             json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
+                #         logging_file_paths.append(json_file_path)
+                #         request_metadata = request_metadata + "\n" + new_request_metadata
+                #     else:
+                #         # If the page exists, retrieve the data
+                #         text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)

                 line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

@@ -1124,6 +1169,11 @@ def redact_image_pdf(file_path:str,

         annotations_all_pages.append(image_annotations)

+        if analysis_type == textract_option:
+            # Write the updated existing textract data back to the JSON file
+            with open(json_file_path, 'w') as json_file:
+                json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
         current_loop_page += 1

         return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

@@ -1142,7 +1192,18 @@ def redact_image_pdf(file_path:str,
             progress.close(_tqdm=progress_bar)
             tqdm._instances.clear()

+            if analysis_type == textract_option:
+                # Write the updated existing textract data back to the JSON file
+                with open(json_file_path, 'w') as json_file:
+                    json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+
             return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
+
+    if analysis_type == textract_option:
+        # Write the updated existing textract data back to the JSON file
+
+        with open(json_file_path, 'w') as json_file:
+            json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed

     return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

@@ -1675,8 +1736,8 @@ def redact_text_pdf(
                 pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image)

                 #print("Did redact_page_with_pymupdf function")
-
-                print("For page number:",
+                reported_page_no = page_no + 1
+                print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")

                 # Write logs
                 # Create decision process table
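The net effect of these changes is a simple per-document cache: Textract output is kept in a <file_name>_textract.json file, pages already present in that file are reused, and only missing pages trigger a new analyze_document call, with the file written back once per loop. A stripped-down sketch of the same lookup pattern (illustrative only, not repo code; the page-record shape with 'page_no' and 'data' keys is taken from the read path in the diff above, and analyse_fn stands in for the Textract call):

    import json, os

    def get_page_blocks(json_file_path: str, page_no: str, analyse_fn):
        # Illustrative helper: reuse cached Textract output for a page, calling
        # analyse_fn(page_no) only when the page is not in the cache yet.
        existing_data = {}
        if os.path.exists(json_file_path):
            with open(json_file_path, 'r') as f:
                existing_data = json.load(f)

        pages = existing_data.setdefault("pages", [])
        for page in pages:
            if page.get('page_no') == page_no:
                return page['data'], existing_data   # cache hit, no Textract call

        blocks = analyse_fn(page_no)                 # cache miss: call Textract
        pages.append({'page_no': page_no, 'data': blocks})
        with open(json_file_path, 'w') as f:
            json.dump(existing_data, f, indent=4)
        return blocks, existing_data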
tools/helper_functions.py CHANGED

@@ -31,9 +31,9 @@ def get_or_create_env_var(var_name, default_value):


 # Names for options labels
-text_ocr_option = "
-tesseract_ocr_option = "OCR
-textract_option = "
+text_ocr_option = "Local model - selectable text"
+tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
+textract_option = "AWS Textract service - all PDF types"

 local_pii_detector = "Local"
 aws_pii_detector = "AWS Comprehend"

@@ -263,6 +263,11 @@ async def get_connection_params(request: gr.Request):
             base_folder = "user-files/"
             print("Cognito ID found:", out_session_hash)

+        elif 'x-amzn-oidc-identity' in request.headers:
+            out_session_hash = request.headers['x-amzn-oidc-identity']
+            base_folder = "user-files/"
+            print("Cognito ID found:", out_session_hash)
+
         else:
             out_session_hash = request.session_hash
             base_folder = "temp-files/"
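The new elif branch lets the app pick up a user identity from the x-amzn-oidc-identity header (set by an AWS ALB with Cognito authentication) before falling back to the Gradio session hash for anonymous sessions. A minimal sketch of that fallback order (illustrative only; the header name and folder values come from the diff, the helper itself is not repo code):

    def resolve_session_id(headers: dict, session_hash: str) -> tuple[str, str]:
        # Mirror the fallback used in get_connection_params for deciding
        # which folder a user's files go into (sketch, not the repo's API).
        if 'x-amzn-oidc-identity' in headers:
            return headers['x-amzn-oidc-identity'], "user-files/"   # authenticated request
        return session_hash, "temp-files/"                          # anonymous Gradio session

    print(resolve_session_id({'x-amzn-oidc-identity': 'user-123'}, 'abc'))  # ('user-123', 'user-files/')
    print(resolve_session_id({}, 'abc'))                                    # ('abc', 'temp-files/')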
tools/load_spacy_model_custom_recognisers.py CHANGED

@@ -8,20 +8,20 @@ from spacy.cli.download import download
 import re

 # %%
-model_name = "
+model_name = "en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001
 custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]

 #Load spacy model
 try:
-    import
-    nlp =
+    import en_core_web_sm
+    nlp = en_core_web_sm.load()
     print("Successfully imported spaCy model")

 except:
     download(model_name)
     nlp = spacy.load(model_name)
-    print("Successfully downloaded and imported spaCy model")
+    print("Successfully downloaded and imported spaCy model", model_name)

 # #### Custom recognisers
 # Allow user to create their own recogniser