AWS Comprehend now uses custom spaCy recognisers on top of the defaults. Added zoom functionality to the annotator. Fixed some PDF mediabox issues and redacted-image output issues.
Browse files- +37 -14
- tools/ +1 -1
- tools/ +10 -0
- tools/ +6 -4
- tools/ +126 -46
- tools/ +12 -1
- tools/ +1 -0
- tools/ +16 -3
@@ -13,9 +13,10 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
13 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
14 |
from tools.file_redaction import choose_and_run_redactor
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
16 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
17 |
from tools.data_anonymise import anonymise_data_files
18 |
from tools.auth import authenticate_user
19 |
20 |
21 |
today_rev ="%Y%m%d")
@@ -29,6 +30,10 @@ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT
29 |
30 |
31 |
32 |
33 |
34 |
@@ -117,6 +122,12 @@ with app:
117 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
118 |
119 |
120 |
121 |
122 |
@@ -164,6 +175,9 @@ with app:
164 |
annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
165 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
166 |
annotation_next_page_button = gr.Button("Next page", scale = 3)
167 |
168 |
annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
169 |
@@ -238,9 +252,9 @@ with app:
238 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
239 |
240 |
with gr.Accordion("Add or remove entity types to redact", open = False):
241 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
242 |
243 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
244 |
245 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
246 |
#with gr.Row():
@@ -260,18 +274,19 @@ with app:
260 |
261 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
262 |
263 |
- = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
264 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
265 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
266 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")
267 |
268 |
269 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
270 |
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
271 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])
272 |
273 |
# If a file has been completed, the function will continue onto the next document
274 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
275 |
then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
276 |
# latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
277 |
# then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
@@ -284,12 +299,20 @@ with app:
284 |
# Page controls at top
285 |
286 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
287 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
288 |
289 |, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
290 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
291 |, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
292 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
293 |
294 |, annotator, json_boxes)
295 |, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
@@ -297,12 +320,12 @@ with app:
297 |
# Page controls at bottom
298 |
299 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
300 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
301 |
302 |, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
303 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
304 |, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
305 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
306 |
307 |
308 |
13 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
14 |
from tools.file_redaction import choose_and_run_redactor
15 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
16 |
from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
17 |
from tools.data_anonymise import anonymise_data_files
18 |
from tools.auth import authenticate_user
19 |
from tools.load_spacy_model_custom_recognisers import custom_entities
20 |
21 |
22 |
today_rev ="%Y%m%d")
30 |
31 |
32 |
33 |
# Add the custom spaCy recognisers to the Comprehend entity list, so that the local spaCy model can pick up entities that Comprehend sometimes misses (e.g. titles, street names, UK postcodes)
34 |
35 |
36 |
37 |
38 |
39 |
122 |
default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
123 |
124 |
125 |
## Annotator zoom value
126 |
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
127 |
zoom_true_bool = gr.State(True)
128 |
zoom_false_bool = gr.State(False)
129 |
130 |
131 |
132 |
133 |
175 |
annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
176 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
177 |
annotation_next_page_button = gr.Button("Next page", scale = 3)
178 |
with gr.Row():
179 |
annotate_zoom_in = gr.Button("Zoom in")
180 |
annotate_zoom_out = gr.Button("Zoom out")
181 |
182 |
annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
183 |
252 |
in_allow_list_text = gr.Textbox(label="Custom allow list load status")
253 |
254 |
with gr.Accordion("Add or remove entity types to redact", open = False):
255 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
256 |
257 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
258 |
259 |
handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
260 |
#with gr.Row():
274 |
275 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
276 |
277 |
+ = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
278 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
279 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
280 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
281 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
282 |
283 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
284 |
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
285 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
286 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
287 |
288 |
# If a file has been completed, the function will continue onto the next document
289 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
290 |
then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
291 |
# latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
292 |
# then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
299 |
# Page controls at top
300 |
301 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
302 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
303 |
304 |, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
305 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
306 |, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
307 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
308 |
309 |
# Zoom in and out on annotator
310 |
+, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
311 |
then(update_zoom, inputs=[annotator_zoom_number, zoom_true_bool], outputs=[annotator_zoom_number]).\
312 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
313 |
+, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
314 |
then(update_zoom, inputs=[annotator_zoom_number, zoom_false_bool], outputs=[annotator_zoom_number]).\
315 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
316 |
317 |, annotator, json_boxes)
318 |, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
320 |
# Page controls at bottom
321 |
322 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
323 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
324 |
325 |, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
326 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
327 |, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
328 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
329 |
330 |
331 |
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
10 |
# Get AWS credentials
11 |
12 |
13 |
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "
14 |
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
15 |
16 |
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
10 |
# Get AWS credentials
11 |
12 |
13 |
RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
14 |
print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
15 |
16 |
AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
@@ -13,6 +13,7 @@ from copy import deepcopy
13 |
from tools.helper_functions import clean_unicode_text
14 |
from tools.aws_functions import comprehend_client
15 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
16 |
#import string # Import string to get a list of common punctuation characters
17 |
18 |
@@ -491,6 +492,14 @@ class CustomImageAnalyzerEngine:
491 |
analyzer_results_by_line[i] = analyzer_result
492 |
493 |
elif pii_identification_method == "AWS Comprehend":
494 |
if len(line_level_ocr_result.text) >= 3:
495 |
# Add line to current batch with a separator
496 |
if current_batch:
@@ -509,6 +518,7 @@ class CustomImageAnalyzerEngine:
509 |
510 |
511 |
512 |
except Exception as e:
513 |
514 |
13 |
from tools.helper_functions import clean_unicode_text
14 |
from tools.aws_functions import comprehend_client
15 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
16 |
from tools.load_spacy_model_custom_recognisers import custom_entities
17 |
#import string # Import string to get a list of common punctuation characters
18 |
19 |
492 |
analyzer_results_by_line[i] = analyzer_result
493 |
494 |
elif pii_identification_method == "AWS Comprehend":
495 |
496 |
# When using AWS Comprehend, the spaCy model is only used to identify the custom entities, because Comprehend cannot specifically pick up titles, street names, or UK postcodes
497 |
text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
498 |
499 |
spacy_analyzer_result = self.analyzer_engine.analyze(
500 |
text=line_level_ocr_result.text, **text_analyzer_kwargs)
501 |
502 |
503 |
if len(line_level_ocr_result.text) >= 3:
504 |
# Add line to current batch with a separator
505 |
if current_batch:
518 |
519 |
520 |
521 |
522 |
except Exception as e:
523 |
524 |
@@ -11,6 +11,8 @@ import pymupdf
11 |
from gradio import Progress
12 |
from typing import List, Optional
13 |
14 |
def is_pdf_or_image(filename):
15 |
16 |
Check if a file name is a PDF or an image file.
@@ -42,7 +44,7 @@ def is_pdf(filename):
42 |
# %%
43 |
## Convert pdf to image if necessary
44 |
45 |
def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
46 |
47 |
# Get the number of pages in the PDF
48 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
@@ -70,7 +72,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
70 |
71 |
72 |
73 |
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=
74 |
75 |
image = image_l[0]
76 |
@@ -334,7 +336,7 @@ def prepare_image_or_pdf(
334 |
335 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
336 |
337 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
338 |
file_path_without_ext = get_file_path_end(in_file_path)
339 |
340 |
out_file_paths = out_text_file_path
@@ -344,7 +346,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
344 |
345 |
pdf_text_image_paths = process_file(out_text_file_path[0])
346 |
out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
347 |
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=
348 |
349 |
# out_file_paths.append(out_text_image_file_path)
350 |
11 |
from gradio import Progress
12 |
from typing import List, Optional
13 |
14 |
image_dpi = 300.0
15 |
16 |
def is_pdf_or_image(filename):
17 |
18 |
Check if a file name is a PDF or an image file.
44 |
# %%
45 |
## Convert pdf to image if necessary
46 |
47 |
def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
48 |
49 |
# Get the number of pages in the PDF
50 |
page_count = pdfinfo_from_path(pdf_path)['Pages']
72 |
73 |
74 |
75 |
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
76 |
77 |
image = image_l[0]
78 |
336 |
337 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
338 |
339 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
340 |
file_path_without_ext = get_file_path_end(in_file_path)
341 |
342 |
out_file_paths = out_text_file_path
346 |
347 |
pdf_text_image_paths = process_file(out_text_file_path[0])
348 |
out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
349 |
pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
350 |
351 |
# out_file_paths.append(out_text_image_file_path)
352 |
@@ -27,8 +27,8 @@ from collections import defaultdict # For efficient grouping
27 |
from presidio_analyzer import RecognizerResult
28 |
29 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
30 |
from tools.file_conversion import process_file
31 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
32 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
33 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
34 |
# from tools.data_anonymise import generate_decision_process_output
@@ -314,8 +314,8 @@ def choose_and_run_redactor(file_paths:List[str],
314 |
315 |
# Save file
316 |
if is_pdf(file_path) == False:
317 |
out_image_file_path = output_folder + file_path_without_ext + "
318 |
pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=
319 |
320 |
321 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
@@ -413,35 +413,40 @@ def choose_and_run_redactor(file_paths:List[str],
413 |
414 |
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
415 |
416 |
417 |
418 |
Convert annotations from pikepdf to pymupdf format
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
429 |
430 |
431 |
432 |
433 |
434 |
435 |
rect_coordinates = [float(coord) for coord in rect_field]
436 |
437 |
438 |
x1, y1, x2, y2 = rect_coordinates
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
447 |
@@ -496,6 +501,64 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
496 |
497 |
return x1, new_y1, x2, new_y2
498 |
499 |
def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
500 |
501 |
Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.
@@ -587,25 +650,25 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
587 |
588 |
# Else it should be a pikepdf annotation object
589 |
590 |
x1, pymupdf_y1, x2, pymupdf_y2 =
591 |
592 |
rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
593 |
594 |
img_annotation_box = {}
595 |
596 |
if image:
597 |
598 |
599 |
600 |
601 |
602 |
603 |
604 |
605 |
if isinstance(annot, Dictionary):
606 |
#print("Trying to get label out of annotation", annot["/T"])
607 |
img_annotation_box["label"] = str(annot["/T"])
608 |
#print("Label is:", img_annotation_box["label"])
609 |
610 |
img_annotation_box["label"] = "REDACTION"
611 |
@@ -646,6 +709,18 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
646 |
merged_bboxes = []
647 |
grouped_bboxes = defaultdict(list)
648 |
649 |
# Reconstruct bounding boxes for substrings of interest
650 |
reconstructed_bboxes = []
651 |
for bbox in bboxes:
@@ -735,16 +810,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
735 |
736 |
737 |
738 |
# Process signature and handwriting results
739 |
if signature_recogniser_results or handwriting_recogniser_results:
740 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
741 |
#print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
742 |
743 |
744 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
745 |
#print("Signature boxes exist at merge:", signature_recogniser_results)
746 |
747 |
748 |
#print("bboxes:", bboxes)
749 |
750 |
return merged_bboxes
@@ -1483,6 +1548,21 @@ def redact_text_pdf(
1483 |
all_text_line_results.append((i, text_line_analyser_result))
1484 |
1485 |
elif pii_identification_method == "AWS Comprehend":
1486 |
if len(text_line.text) >= 3:
1487 |
# Add separator between lines
1488 |
if current_batch:
27 |
from presidio_analyzer import RecognizerResult
28 |
29 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
30 |
from tools.file_conversion import process_file, image_dpi
31 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
32 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
33 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
34 |
# from tools.data_anonymise import generate_decision_process_output
314 |
315 |
# Save file
316 |
if is_pdf(file_path) == False:
317 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_pdf.pdf"
318 |
pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
319 |
320 |
321 |
out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"
413 |
414 |
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
415 |
416 |
def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
    """
    Convert a pikepdf annotation rectangle to pymupdf coordinates, handling
    pages whose mediabox is larger than the page rect.

    The annotation's "/Rect" entry is read as (x1, y1, x2, y2). The y values
    are flipped by subtracting them from the mediabox height, and both axes
    are shifted by an offset derived from the size difference between the
    mediabox and the page rect.

    Args:
        pymupdf_page: object exposing ``rect`` and ``mediabox``, each with
            ``height`` and ``width`` attributes.
        pikepdf_bbox: mapping whose "/Rect" entry holds four numeric values.

    Returns:
        Tuple ``(new_x1, new_y1, new_x2, new_y2)`` of floats.
    """
    page_rect = pymupdf_page.rect
    media_box = pymupdf_page.mediabox

    media_height = media_box.height

    # How much larger the mediabox is than the page rect on each axis
    y_excess = media_height - page_rect.height
    x_excess = media_box.width - page_rect.width

    # Shift applied to each axis: the excess scaled by its ratio to the rect size
    y_offset = y_excess * (y_excess / page_rect.height)
    x_offset = x_excess * (x_excess / page_rect.width)

    # Pull the four corner values out of the annotation as floats
    x1, y1, x2, y2 = (float(coord) for coord in pikepdf_bbox["/Rect"])

    # y2 feeds the new y1 (and vice versa) because the vertical axis is flipped
    return (
        x1 - x_offset,
        media_height - y2 - y_offset,
        x2 - x_offset,
        media_height - y1 - y_offset,
    )
450 |
451 |
def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
452 |
501 |
502 |
return x1, new_y1, x2, new_y2
503 |
504 |
# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
505 |
# '''
506 |
# Converts coordinates from pymupdf format to image coordinates.
507 |
# '''
508 |
509 |
# rect_height = pymupdf_page.rect.height
510 |
# rect_width = pymupdf_page.rect.width
511 |
512 |
# image_page_width, image_page_height = image.size
513 |
514 |
# # Calculate scaling factors between pymupdf and PIL image
515 |
# scale_width = image_page_width / rect_width
516 |
# scale_height = image_page_height / rect_height
517 |
518 |
# x1_image = x1 * scale_width
519 |
# y1_image = ((rect_height - y2) * scale_height)
520 |
# x2_image = x2 * scale_width
521 |
# y2_image = ((rect_height - y1) * scale_height)
522 |
523 |
# return x1_image, y1_image, x2_image, y2_image
524 |
525 |
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
    """
    Converts coordinates from pymupdf format to image coordinates,
    accounting for mediabox dimensions.

    Each coordinate is first scaled by image size / mediabox size, then by
    mediabox size / page rect size.

    Args:
        pymupdf_page: object exposing ``rect`` and ``mediabox``, each with
            ``height`` and ``width`` attributes.
        x1, y1, x2, y2: box coordinates in pymupdf page space.
        image: object whose ``size`` attribute is ``(width, height)``.

    Returns:
        Tuple ``(x1_image, y1_image, x2_image, y2_image)``.
    """
    rect_height = pymupdf_page.rect.height
    rect_width = pymupdf_page.rect.width

    # Get mediabox dimensions
    mediabox = pymupdf_page.mediabox
    mediabox_width = mediabox.width
    mediabox_height = mediabox.height

    image_page_width, image_page_height = image.size

    # Calculate scaling factors using mediabox dimensions
    scale_width = image_page_width / mediabox_width
    scale_height = image_page_height / mediabox_height

    # Ratio between the mediabox and the page rect on each axis
    rect_to_mediabox_x_scale = mediabox_width / rect_width
    rect_to_mediabox_y_scale = mediabox_height / rect_height

    # Adjust coordinates based on scaling factors.
    # Debug prints of the four factors were removed: this runs once per
    # annotation box and was flooding stdout.
    x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
    y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
    x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
    y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale

    return x1_image, y1_image, x2_image, y2_image
561 |
562 |
def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
563 |
564 |
Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.
650 |
651 |
# Else it should be a pikepdf annotation object
652 |
653 |
x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
654 |
655 |
rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
656 |
657 |
img_annotation_box = {}
658 |
659 |
if image:
660 |
img_width, img_height = image.size
661 |
662 |
x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
663 |
664 |
img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
665 |
img_annotation_box["ymin"] = image_y1 #* (img_width / rect_width) # Use adjusted y1
666 |
img_annotation_box["xmax"] = x2# * (img_height / rect_height) # Use adjusted x2
667 |
img_annotation_box["ymax"] = image_y2 #* (img_height / rect_height) # Use adjusted y2
668 |
img_annotation_box["color"] = (0, 0, 0)
669 |
670 |
if isinstance(annot, Dictionary):
671 |
img_annotation_box["label"] = str(annot["/T"])
672 |
673 |
img_annotation_box["label"] = "REDACTION"
674 |
709 |
merged_bboxes = []
710 |
grouped_bboxes = defaultdict(list)
711 |
712 |
713 |
# Process signature and handwriting results
714 |
if signature_recogniser_results or handwriting_recogniser_results:
715 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
716 |
#print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
717 |
718 |
719 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
720 |
#print("Signature boxes exist at merge:", signature_recogniser_results)
721 |
722 |
723 |
724 |
# Reconstruct bounding boxes for substrings of interest
725 |
reconstructed_bboxes = []
726 |
for bbox in bboxes:
810 |
811 |
812 |
813 |
#print("bboxes:", bboxes)
814 |
815 |
return merged_bboxes
1548 |
all_text_line_results.append((i, text_line_analyser_result))
1549 |
1550 |
elif pii_identification_method == "AWS Comprehend":
1551 |
1552 |
# First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1553 |
custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1554 |
1555 |
text_line_analyser_result = nlp_analyser.analyze(
1556 |
1557 |
1558 |
1559 |
1560 |
1561 |
1562 |
1563 |
all_text_line_results.append((i, text_line_analyser_result))
1564 |
1565 |
1566 |
if len(text_line.text) >= 3:
1567 |
# Add separator between lines
1568 |
if current_batch:
@@ -3,9 +3,20 @@ import re
3 |
import gradio as gr
4 |
import pandas as pd
5 |
import unicodedata
6 |
7 |
def reset_state_vars():
    """
    Return a fresh set of empty values: two empty lists, two empty
    DataFrames, the integer 0 and an empty string, in that order.
    """
    # Build two independent empty frames so callers never share one object
    empty_frames = [pd.DataFrame() for _ in range(2)]
    return ([], [], *empty_frames, 0, "")
9 |
10 |
def get_or_create_env_var(var_name, default_value):
11 |
# Get the environment variable if it exists
3 |
import gradio as gr
4 |
import pandas as pd
5 |
import unicodedata
6 |
from gradio_image_annotation import image_annotator
7 |
8 |
def reset_state_vars():
9 |
return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
10 |
label="Modify redaction boxes",
11 |
12 |
label_colors=[(0, 0, 0)],
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
def get_or_create_env_var(var_name, default_value):
22 |
# Get the environment variable if it exists
@@ -10,6 +10,7 @@ import re
10 |
# %%
11 |
model_name = "en_core_web_lg" #"en_core_web_trf"
12 |
score_threshold = 0.001
13 |
14 |
# %% [markdown]
15 |
# #### Custom recognisers
10 |
# %%
11 |
model_name = "en_core_web_lg" #"en_core_web_trf"
12 |
score_threshold = 0.001
13 |
custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
14 |
15 |
# %% [markdown]
16 |
# #### Custom recognisers
@@ -37,9 +37,22 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
37 |
38 |
return max_pages, max_pages
39 |
40 |
41 |
# print("\nImage annotator object:", image_annotator_object)
42 |
43 |
if not image_annotator_object:
44 |
return image_annotator(
45 |
label="Modify redaction boxes",
@@ -76,8 +89,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
76 |
77 |
#label_colors=[(0, 0, 0)],
78 |
79 |
80 |
81 |
82 |
83 |
37 |
38 |
return max_pages, max_pages
39 |
40 |
def update_zoom(current_zoom_level:int, decrease:bool=True):
41 |
if decrease == False:
42 |
if current_zoom_level >= 50:
43 |
current_zoom_level -= 10
44 |
45 |
if current_zoom_level < 100:
46 |
current_zoom_level += 10
47 |
48 |
return current_zoom_level
49 |
50 |
51 |
def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
52 |
# print("\nImage annotator object:", image_annotator_object)
53 |
54 |
zoom_str = str(zoom) + '%'
55 |
56 |
if not image_annotator_object:
57 |
return image_annotator(
58 |
label="Modify redaction boxes",
89 |
90 |
#label_colors=[(0, 0, 0)],
91 |
92 |
93 |
94 |
95 |
96 |