Merge pull request #2 from seanpedrick-case/dev
- app.py +28 -17
- doc_redaction_amplify_app +1 -0
- tools/auth.py +22 -20
- tools/aws_textract.py +45 -23
- tools/custom_image_analyser_engine.py +27 -11
- tools/file_conversion.py +41 -43
- tools/file_redaction.py +23 -64
- tools/load_spacy_model_custom_recognisers.py +13 -5
- tools/redaction_review.py +53 -58
app.py
CHANGED
@@ -13,7 +13,7 @@ from gradio_image_annotation.image_annotator import AnnotatedImageData
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
-from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
+from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
 from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
@@ -41,6 +41,8 @@ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREET
 
 language = 'en'
 
+
+
 host_name = socket.gethostname()
 feedback_logs_folder = 'feedback/' + today_rev + '/' + host_name + '/'
 access_logs_folder = 'logs/' + today_rev + '/' + host_name + '/'
@@ -84,6 +86,8 @@ with app:
     log_files_output_list_state = gr.State([])
 
     review_file_state = gr.State(pd.DataFrame())
+
+    do_not_save_pdf_state = gr.State(False)
 
     # Logging state
     log_file_name = 'log.csv'
@@ -117,7 +121,7 @@ with app:
 
 
     ## Annotator zoom value
-    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=
+    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
     zoom_true_bool = gr.State(True)
     zoom_false_bool = gr.State(False)
 
@@ -344,16 +348,18 @@ with app:
     # Page controls at top
     annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
-
-
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+        then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
         then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+        then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
+
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
         then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+        then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
     # Zoom in and out on annotator
     annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
@@ -370,18 +376,23 @@ with app:
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
+    do_not_save_pdf_state
+
     # Page controls at bottom
     annotate_current_page_bottom.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+        then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
        then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+        then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
        then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
+        then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
     # Review side bar controls
     recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
@@ -420,13 +431,13 @@ with app:
     app.load(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
     # If running on AWS, load in the default allow list file from S3
-    if RUN_AWS_FUNCTIONS == "1":
-        print("default_allow_list_output_folder_location:", default_allow_list_loc)
-        if not os.path.exists(default_allow_list_loc):
-            app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
-            then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
-        else:
-            app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+    # if RUN_AWS_FUNCTIONS == "1":
+    #     print("default_allow_list_output_folder_location:", default_allow_list_loc)
+    #     if not os.path.exists(default_allow_list_loc):
+    #         app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
+    #         then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
+    #     else:
+    #         app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
 
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
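Note on the wiring above: every page-navigation event now runs a three-step Gradio chain (modify_existing_page_redactions, then update_annotator, then apply_redactions), with the new do_not_save_pdf_state flag passed through so the full PDF is not rewritten on every page turn. The sketch below only illustrates that .submit()/.then() chaining pattern with a state flag; apart from the Gradio API itself, every component and function name here is invented and is not taken from this repository.

import gradio as gr

def modify_state(page, state):
    # Placeholder: record the page that was just left before moving on.
    return state, page

def refresh_viewer(state, page):
    # Placeholder: return whatever the annotator component should now display.
    return f"showing page {page}"

def save_outputs(state, page, skip_pdf_save):
    # Placeholder: write review files; skip the expensive PDF rewrite when the flag is True.
    return "saved (pdf skipped)" if skip_pdf_save else "saved (pdf rewritten)"

with gr.Blocks() as demo:
    annotations_state = gr.State({})       # analogous to all_image_annotations_state
    do_not_save_pdf = gr.State(True)       # analogous to do_not_save_pdf_state
    page_number = gr.Number(value=1, precision=0, label="Page")
    viewer = gr.Textbox(label="Annotator placeholder")
    status = gr.Textbox(label="Save status")

    # .submit() fires when the number box is confirmed; each .then() runs after the
    # previous step finishes, mirroring the modify -> update -> apply chain above.
    page_number.submit(modify_state, inputs=[page_number, annotations_state],
                       outputs=[annotations_state, page_number]).\
        then(refresh_viewer, inputs=[annotations_state, page_number], outputs=[viewer]).\
        then(save_outputs, inputs=[annotations_state, page_number, do_not_save_pdf],
             outputs=[status])

if __name__ == "__main__":
    demo.launch()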
doc_redaction_amplify_app
ADDED
@@ -0,0 +1 @@
+Subproject commit 9585642e4d1f72fc49971789693d5584661084c8
tools/auth.py
CHANGED
@@ -7,13 +7,13 @@ import base64
 from tools.helper_functions import get_or_create_env_var
 
 client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
-print(f'The value of AWS_CLIENT_ID is {client_id}')
+#print(f'The value of AWS_CLIENT_ID is {client_id}')
 
 client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
-print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
+#print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
 
 user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
-print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
+#print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
 
 def calculate_secret_hash(client_id, client_secret, username):
     message = username + client_id
@@ -46,24 +46,26 @@ def authenticate_user(username:str, password:str, user_pool_id:str=user_pool_id,
 
     try:
 
-        …
+        if client_secret == '':
+            response = client.initiate_auth(
+                AuthFlow='USER_PASSWORD_AUTH',
+                AuthParameters={
+                    'USERNAME': username,
+                    'PASSWORD': password,
+                },
+                ClientId=client_id
+            )
 
-        …
+        else:
+            response = client.initiate_auth(
+                AuthFlow='USER_PASSWORD_AUTH',
+                AuthParameters={
+                    'USERNAME': username,
+                    'PASSWORD': password,
+                    'SECRET_HASH': secret_hash
+                },
+                ClientId=client_id
+            )
 
         # If successful, you'll receive an AuthenticationResult in the response
         if response.get('AuthenticationResult'):
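Context for the branch above: Cognito app clients created with a client secret require a SECRET_HASH in initiate_auth, while clients without a secret reject it, which is why the call now branches on client_secret == ''. The helper below is a sketch of the AWS-documented SECRET_HASH formula (base64 of an HMAC-SHA256 of username + client_id, keyed with the client secret); only the `message = username + client_id` line comes from the diff, the rest is assumed, and the example values are invented.

import base64
import hashlib
import hmac

def calculate_secret_hash(client_id: str, client_secret: str, username: str) -> str:
    # Standard Cognito formula: HMAC-SHA256(client_secret, username + client_id), base64-encoded.
    message = username + client_id
    digest = hmac.new(client_secret.encode('utf-8'),
                      msg=message.encode('utf-8'),
                      digestmod=hashlib.sha256).digest()
    return base64.b64encode(digest).decode()

if __name__ == "__main__":
    # Only app clients that actually have a secret should send this value.
    print(calculate_secret_hash("example-client-id", "example-client-secret", "alice"))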
tools/aws_textract.py
CHANGED
@@ -145,8 +145,9 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
 
         # Extract text and bounding box for the line
         line_text = text_block.get('Text', '')
-
         words = []
+        current_line_handwriting_results = []  # Track handwriting results for this line
+
         if 'Relationships' in text_block:
             for relationship in text_block['Relationships']:
                 if relationship['Type'] == 'CHILD':
@@ -179,35 +180,56 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
         if text_type == "HANDWRITING":
             is_handwriting = True
             entity_name = "HANDWRITING"
-            word_end = len(
-            …
-            recogniser_result = CustomImageRecognizerResult(
-            …
+            word_end = len(word_text)
+
+            recogniser_result = CustomImageRecognizerResult(
+                entity_type=entity_name,
+                text=word_text,
+                score=confidence,
+                start=0,
+                end=word_end,
+                left=word_left,
+                top=word_top,
+                width=word_width_abs,
+                height=word_height_abs
+            )
+
+            # Add to handwriting collections immediately
+            handwriting.append(recogniser_result)
+            handwriting_recogniser_results.append(recogniser_result)
+            signature_or_handwriting_recogniser_results.append(recogniser_result)
+            current_line_handwriting_results.append(recogniser_result)
 
         # If handwriting or signature, add to bounding box
 
         elif (text_block['BlockType'] == 'SIGNATURE'):
             line_text = "SIGNATURE"
-            …
             is_signature = True
             entity_name = "SIGNATURE"
-            confidence = text_block
-            word_end = len(
-            …
-            recogniser_result = CustomImageRecognizerResult(
-            …
+            confidence = text_block.get('Confidence', 0)
+            word_end = len(line_text)
+
+            recogniser_result = CustomImageRecognizerResult(
+                entity_type=entity_name,
+                text=line_text,
+                score=confidence,
+                start=0,
+                end=word_end,
+                left=line_left,
+                top=line_top,
+                width=width_abs,
+                height=height_abs
+            )
+
+            # Add to signature collections immediately
+            signatures.append(recogniser_result)
+            signature_recogniser_results.append(recogniser_result)
+            signature_or_handwriting_recogniser_results.append(recogniser_result)
+
+            words = [{
+                'text': line_text,
+                'bounding_box': (line_left, line_top, line_right, line_bottom)
+            }]
 
         ocr_results_with_children["text_line_" + str(i)] = {
             "line": i,
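The new CustomImageRecognizerResult construction above takes absolute pixel values (left/top/width/height) for each handwriting word or signature block, whereas Textract reports geometry as page-relative ratios, so a conversion along the lines of the sketch below is implied. The helper name and the example block are invented for illustration; only the Textract BoundingBox field names and the Confidence/BlockType keys are standard Textract output.

def textract_box_to_pixels(block: dict, page_width: int, page_height: int) -> dict:
    # Textract BoundingBox values (Left, Top, Width, Height) are ratios of the page size,
    # so multiply by the page dimensions to get absolute pixel coordinates.
    bbox = block['Geometry']['BoundingBox']
    return {
        "left": int(bbox['Left'] * page_width),
        "top": int(bbox['Top'] * page_height),
        "width": int(bbox['Width'] * page_width),
        "height": int(bbox['Height'] * page_height),
        "score": block.get('Confidence', 0),
        "text": block.get('Text', block['BlockType']),  # SIGNATURE blocks carry no text
    }

if __name__ == "__main__":
    example_signature_block = {
        "BlockType": "SIGNATURE",
        "Confidence": 87.5,
        "Geometry": {"BoundingBox": {"Left": 0.1, "Top": 0.8, "Width": 0.25, "Height": 0.05}},
    }
    print(textract_box_to_pixels(example_signature_block, page_width=2480, page_height=3508))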
tools/custom_image_analyser_engine.py
CHANGED
@@ -14,6 +14,7 @@ from tools.helper_functions import clean_unicode_text
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
+import re # Add this import at the top of the file
 
 @dataclass
 class OCRResult:
@@ -493,11 +494,12 @@ class CustomImageAnalyzerEngine:
 
         elif pii_identification_method == "AWS Comprehend":
 
-            # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+            # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. This is because Comprehend can't pick up Titles, Streetnames, and UKPostcodes, or a custom deny list specifically
             text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
 
             spacy_analyzer_result = self.analyzer_engine.analyze(
                 text=line_level_ocr_result.text, **text_analyzer_kwargs)
+
             analyzer_results_by_line[i].extend(spacy_analyzer_result)
 
             if len(line_level_ocr_result.text) >= 3:
@@ -573,7 +575,7 @@ class CustomImageAnalyzerEngine:
         for result in analyzer_result:
             # Extract the relevant portion of text based on start and end
             relevant_text = line_level_ocr_results[i].text[result.start:result.end]
-
+
             # Find the corresponding entry in ocr_results_with_children
             child_words = ocr_results_with_children_line_level['words']
 
@@ -583,13 +585,23 @@ class CustomImageAnalyzerEngine:
             word_num = 0 # Initialize word count
             total_width = 0 # Initialize total width
 
-            …
+            split_relevant_text = relevant_text.split()
+
+            loop_child_words = child_words.copy()
+
+            for word_text in split_relevant_text: # Iterate through each word in relevant_text
+
+                quote_str = '"'
+                replace_str = '(?:"|“|”)'
+
+                word_regex = rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'
+
+                for word in loop_child_words:
+                    # Check for regex as whole word
+
+                    if re.search(word_regex, word['text']):
+                    #if re.search(r'\b' + re.escape(word_text) + r'\b', word['text']):
                         found_word = word
-                        #print("found_word:", found_word)
 
                         if word_num == 0: # First word
                             left = found_word['bounding_box'][0]
@@ -598,6 +610,10 @@ class CustomImageAnalyzerEngine:
                         all_words += found_word['text'] + " " # Concatenate words
                         total_width = found_word['bounding_box'][2] - left # Add each word's width
                         word_num += 1
+
+                        # Drop the first word of child_words
+                        loop_child_words = loop_child_words[1:] # Skip the first word
+
                         break # Move to the next word in relevant_text
 
             width = total_width + horizontal_buffer # Set width to total width of all matched words
@@ -621,9 +637,9 @@ class CustomImageAnalyzerEngine:
             result_reset_pos.start = 0
             result_reset_pos.end = len(relevant_text)
 
-            …
+            print("result_reset_pos:", result_reset_pos)
+            print("relevant_line_ocr_result:", relevant_line_ocr_result)
+            print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
 
             # Map the analyzer results to bounding boxes for this line
             line_results = self.map_analyzer_results_to_bounding_boxes(
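The word-matching change above replaces a plain \b…\b search with a lookaround-based whole-word regex that also tolerates the curly quotes OCR often produces. A minimal standalone check of that pattern, with invented example strings:

import re

def build_word_regex(word_text: str) -> str:
    # Same construction as in the diff: escape the term, widen straight quotes to
    # also match curly quotes, and forbid word characters on either side.
    quote_str = '"'
    replace_str = '(?:"|“|”)'
    return rf'(?<!\w){re.escape(word_text.strip()).replace(quote_str, replace_str)}(?!\w)'

ocr_words = ['Mr', 'Smithson', '“Smith”', 'Smith,']

pattern = build_word_regex('"Smith"')
print([w for w in ocr_words if re.search(pattern, w)])  # only the curly-quoted '“Smith”'

pattern = build_word_regex('Smith')
print([w for w in ocr_words if re.search(pattern, w)])  # '“Smith”' and 'Smith,' but not 'Smithson'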
tools/file_conversion.py
CHANGED
@@ -1,5 +1,5 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option,
+from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
 from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 import os
@@ -48,7 +48,8 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
 
-
+CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
+print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
 
 def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
@@ -261,7 +262,10 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
         else:
             out_colour = img_annotation_box["color"]
     else:
-        …
+        if CUSTOM_BOX_COLOUR == "grey":
+            out_colour = (0.5, 0.5, 0.5)
+        else:
+            out_colour = (0,0,0)
 
     shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
     #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
@@ -478,11 +482,12 @@ def prepare_image_or_pdf(
             annotation["image"] = image_path
 
             all_annotations_object.append(annotation)
-
-            #print("all_annotations_object:", all_annotations_object)
-
 
         elif is_pdf_or_image(file_path): # Alternatively, if it's an image
+            # Check if the file is an image type and the user selected text ocr option
+            if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
+                in_redact_method = tesseract_ocr_option
+
             # Convert image to a pymupdf document
             pymupdf_doc = pymupdf.open() # Create a new empty document
 
@@ -491,14 +496,16 @@ def prepare_image_or_pdf(
             page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
             page.insert_image(rect, filename=file_path) # Insert the image into the page
 
+            file_path_str = str(file_path)
+
+            image_file_paths = process_file(file_path_str, prepare_for_review)
+
+            print("Inserted image into PDF file")
 
-        # Check if the file is an image type and the user selected text ocr option
-        elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
-            in_redact_method = tesseract_ocr_option
 
         elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
-            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
+            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
             json_from_csv = True
             print("Converted CSV review file to json")
 
@@ -618,12 +625,7 @@ def prepare_image_or_pdf(
     out_message.append(out_time)
     out_message_out = '\n'.join(out_message)
 
-    #if prepare_for_review == False:
     number_of_pages = len(image_file_paths)
-    #else:
-    #    number_of_pages = len(all_annotations_object)
-
-    #print("all_annotations_object at end:", all_annotations_object)
 
     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
@@ -650,23 +652,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
 
     return out_message, out_file_paths
 
-# Example DataFrames
-# df1 = pd.DataFrame({
-#     'xmin': [10, 20, 30],
-#     'xmax': [15, 25, 35],
-#     'ymin': [40, 50, 60],
-#     'ymax': [45, 55, 65],
-#     'info1': ['A', 'B', 'C']
-# })
-
-# df2 = pd.DataFrame({
-#     'xmin': [12, 18, 32],
-#     'xmax': [14, 24, 34],
-#     'ymin': [42, 48, 62],
-#     'ymax': [44, 54, 66],
-#     'info2': ['X', 'Y', 'Z']
-# })
-
 def join_values_within_threshold(df1, df2):
     # Threshold for matching
     threshold = 5
@@ -757,25 +742,38 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
 
     return df
 
-def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
+def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
+    '''
+    Convert a review csv to a json file for use by the Gradio Annotation object
+    '''
     # Keep only necessary columns
    df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
 
     # Group the DataFrame by the 'image' column
-    …
+    grouped_csv_pages = df.groupby('page')
 
     # Create a list to hold the JSON data
     json_data = []
 
-    …
+    for n, pdf_image_path in enumerate(image_paths):
+        reported_page_number = int(n + 1)
+
+        if reported_page_number in df["page"].values:
+
+            # Convert each relevant group to a list of box dictionaries
+            selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
+            annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
+
+            annotation = {
+                "image": pdf_image_path,
+                "boxes": annotation_boxes
+            }
+
+        else:
+            annotation = {}
+            annotation["image"] = pdf_image_path
+
         # Append the structured data to the json_data list
-    json_data.append(
-        "image": image_path,
-        "boxes": boxes
-    })
+        json_data.append(annotation)
 
     return json_data
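convert_pandas_df_to_review_json now takes the list of page images so it can emit one annotation entry per page, with a "boxes" list only where the review CSV has rows; the related CUSTOM_BOX_COLOUR variable only recognises "grey", with anything else falling back to black. A small sketch of the expected CSV columns and the resulting structure, using invented rows and paths:

import pandas as pd

# Minimal review-CSV stand-in; column names follow the diff, values are made up.
review_df = pd.DataFrame({
    "image": ["input/doc_0.png", "input/doc_0.png"],
    "page":  [1, 1],
    "xmin":  [100, 400], "ymin": [200, 620],
    "xmax":  [260, 540], "ymax": [230, 650],
    "color": ["(0, 0, 0)", "(0, 0, 0)"],
    "label": ["PERSON", "EMAIL_ADDRESS"],
})
image_paths = ["input/doc_0.png", "input/doc_1.png"]  # one entry per PDF page

grouped = review_df.groupby("page")
json_data = []
for n, image_path in enumerate(image_paths):
    page_no = n + 1
    if page_no in review_df["page"].values:
        boxes = grouped.get_group(page_no).drop(columns=["image", "page"]).to_dict(orient="records")
        json_data.append({"image": image_path, "boxes": boxes})
    else:
        json_data.append({"image": image_path})  # page with no redaction boxes

print(json_data[0]["boxes"][0])  # first box on page 1, keyed by the remaining columns
print(json_data[1])              # page 2 has no boxes, only the image path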
tools/file_redaction.py
CHANGED
@@ -269,7 +269,7 @@ def choose_and_run_redactor(file_paths:List[str],
         print("Redacting file:", file_path_without_ext)
 
         is_a_pdf = is_pdf(file_path) == True
-        if is_a_pdf == False:
+        if is_a_pdf == False and in_redact_method == text_ocr_option:
             # If user has not submitted a pdf, assume it's an image
             print("File is not a pdf, assuming that image analysis needs to be used.")
             in_redact_method = tesseract_ocr_option
@@ -708,8 +708,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
     if image:
         img_width, img_height = image.size
 
-        print("annot:", annot)
-
         x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
 
         img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
@@ -745,16 +743,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
         "boxes": all_image_annotation_boxes
     }
 
-
-
-
     page.apply_redactions(images=0, graphics=0)
     page.clean_contents()
 
     return page, out_annotation_boxes
 
-
-
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
 
     all_bboxes = []
@@ -832,7 +825,11 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
             for next_box in group[1:]:
                 if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
                     new_text = merged_box.text + " " + next_box.text
-
+
+                    if merged_box.entity_type != next_box.entity_type:
+                        new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
+                    else:
+                        new_entity_type = merged_box.entity_type
 
                     new_left = min(merged_box.left, next_box.left)
                     new_top = min(merged_box.top, next_box.top)
@@ -973,9 +970,6 @@ def redact_image_pdf(file_path:str,
     print("Page range:", str(page_min + 1), "to", str(page_max))
     #print("Current_loop_page:", current_loop_page)
 
-    if analysis_type == tesseract_ocr_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".csv"
-    elif analysis_type == textract_option: ocr_results_file_path = output_folder + "ocr_results_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.csv"
-
     # If running Textract, check if file already exists. If it does, load in existing data
     # Import results from json and convert
     if analysis_type == textract_option:
@@ -984,7 +978,6 @@ def redact_image_pdf(file_path:str,
         log_files_output_paths.append(json_file_path)
 
         if not os.path.exists(json_file_path):
-            no_textract_file = True
             print("No existing Textract results file found.")
             existing_data = {}
             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
@@ -1042,12 +1035,8 @@ def redact_image_pdf(file_path:str,
 
         # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
         if analysis_type == tesseract_ocr_option:
-
             word_level_ocr_results = image_analyser.perform_ocr(image)
-
-            # Combine OCR results
             line_level_ocr_results, line_level_ocr_results_with_children = combine_ocr_results(word_level_ocr_results)
-
 
         # Import results from json and convert
         if analysis_type == textract_option:
@@ -1085,44 +1074,6 @@ def redact_image_pdf(file_path:str,
                 text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
 
 
-            # if not os.path.exists(json_file_path):
-            #     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
-            #     log_files_output_paths.append(json_file_path)
-            #     request_metadata = request_metadata + "\n" + new_request_metadata
-
-            #     existing_data = {"pages":[text_blocks]}
-
-
-            # else:
-            #     # Open the file and load the JSON data
-            #     print("Found existing Textract json results file.")
-            #     with open(json_file_path, 'r') as json_file:
-            #         existing_data = json.load(json_file)
-
-            #     # Check if the current reported_page_number exists in the loaded JSON
-            #     page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
-
-            #     if not page_exists: # If the page does not exist, analyze again
-            #         print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
-            #         text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
-
-            #         # Check if "pages" key exists, if not, initialize it as an empty list
-            #         if "pages" not in existing_data:
-            #             existing_data["pages"] = []
-
-            #         # Append the new page data
-            #         existing_data["pages"].append(text_blocks)
-
-            #         # Write the updated existing_data back to the JSON file
-            #         with open(json_file_path, 'w') as json_file:
-            #             json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
-
-            #         log_files_output_paths.append(json_file_path)
-            #         request_metadata = request_metadata + "\n" + new_request_metadata
-            #     else:
-            #         # If the page exists, retrieve the data
-            #         text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
-
             line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
 
             # Step 2: Analyze text and identify PII
@@ -1194,7 +1145,8 @@ def redact_image_pdf(file_path:str,
             else:
                 #print("redact_whole_page_list:", redact_whole_page_list)
                 if redact_whole_page_list:
-                    …
+                    int_reported_page_number = int(reported_page_number)
+                    if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
                 else: redact_whole_page = False
             else: redact_whole_page = False
 
@@ -1345,8 +1297,14 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
         character_objects_out.append(char) # Collect character objects
 
         if isinstance(char, LTAnno):
+
+            added_text = char.get_text()
+
+            # Handle double quotes
+            added_text = added_text.replace('"', '\\"') # Escape double quotes
+
             # Handle space separately by finalizing the word
-            full_text +=
+            full_text += added_text # Adds space or newline
 
             if current_word: # Only finalize if there is a current word
                 word_bboxes.append((current_word, current_word_bbox))
@@ -1354,7 +1312,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
                 current_word_bbox = [float('inf'), float('inf'), float('-inf'), float('-inf')] # Reset for next word
 
             # Check for line break (assuming a new line is indicated by a specific character)
-            if '\n' in
+            if '\n' in added_text:
                 #print("char_anno:", char)
                 # Finalize the current line
                 if current_word:
@@ -1373,7 +1331,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
 
         # Concatenate text for LTChar
 
-
         #full_text += char.get_text()
         #added_text = re.sub(r'[^\x00-\x7F]+', ' ', char.get_text())
         added_text = char.get_text()
@@ -1382,8 +1339,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
         added_text = clean_unicode_text(added_text)
         full_text += added_text # Adds space or newline, removing
 
-
-
         # Update overall bounding box
         x0, y0, x1, y1 = char.bbox
         overall_bbox[0] = min(overall_bbox[0], x0) # x0
@@ -1480,7 +1435,10 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
                 merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
                 merged_result.end = max(current_result.end, result.end) # Extend text range
                 try:
-                    …
+                    if current_result.entity_type != result.entity_type:
+                        merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
+                    else:
+                        merged_result.entity_type = current_result.entity_type
                 except Exception as e:
                     print("Unable to combine result entity types:", e)
                 if current_text:
@@ -1877,8 +1835,9 @@ def redact_text_pdf(
 
                 # Make pymupdf page redactions
                 #print("redact_whole_page_list:", redact_whole_page_list)
-                if redact_whole_page_list:
-                    …
+                if redact_whole_page_list:
+                    int_reported_page_number = int(reported_page_number)
+                    if int_reported_page_number in redact_whole_page_list: redact_whole_page = True
                 else: redact_whole_page = False
                 else: redact_whole_page = False
 
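Both merge_img_bboxes and merge_text_bounding_boxes now record a combined label when two adjacent boxes of different entity types are merged, rather than silently keeping only the first label. The toy function below simply restates that rule for clarity; it is illustrative and is not code from the repository.

def merge_entity_types(current_type: str, next_type: str) -> str:
    # Differing types are joined with " - "; identical types are kept unchanged,
    # mirroring the new branches added in the try blocks above.
    if current_type != next_type:
        return current_type + " - " + next_type
    return current_type

print(merge_entity_types("PERSON", "PERSON"))         # PERSON
print(merge_entity_types("PERSON", "EMAIL_ADDRESS"))  # PERSON - EMAIL_ADDRESS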
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -24,14 +24,22 @@ except:
24 |      print("Successfully downloaded and imported spaCy model", model_name)
25 | 
26 |  # #### Custom recognisers
   | -# Allow user to create their own recogniser
27 |  def custom_word_list_recogniser(custom_list:List[str]=[]):
28 | +    # Create regex pattern, handling quotes carefully
29 | 
30 | +    quote_str = '"'
31 | +    replace_str = '(?:"|"|")'
32 | +
33 | +    custom_regex = '|'.join(
34 | +        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
35 | +        for term in custom_list
36 | +    )
37 | +    print(custom_regex)
38 | +
   | -    custom_pattern = Pattern(name="custom_pattern",regex=custom_regex, score = 1)
39 | +    custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score = 1)
40 | +
41 |      custom_recogniser = PatternRecognizer(supported_entity="CUSTOM", name="CUSTOM", patterns = [custom_pattern],
42 | +                                          global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE)
43 | 
44 |      return custom_recogniser
45 | 
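The rewritten custom_word_list_recogniser now builds a single alternation regex from the user's term list, escaping each term, swapping literal double quotes for a group that also matches curly quotes, and wrapping the result in a Presidio PatternRecognizer that matches case-insensitively. A usage sketch, assuming presidio-analyzer is installed; the sample terms and text are illustrative only, and the quote substitution is omitted because the sample terms contain no quotes.

from presidio_analyzer import Pattern, PatternRecognizer
import re

# Build the recogniser roughly as the new function does (sample terms are illustrative)
terms = ["Jane Doe", "Project Falcon"]
custom_regex = '|'.join(rf'(?<!\w){re.escape(t.strip())}(?!\w)' for t in terms)
custom_pattern = Pattern(name="custom_pattern", regex=custom_regex, score=1)
custom_recogniser = PatternRecognizer(
    supported_entity="CUSTOM",
    name="CUSTOM",
    patterns=[custom_pattern],
    global_regex_flags=re.DOTALL | re.MULTILINE | re.IGNORECASE,
)

# Run it directly against some text; entities=["CUSTOM"] limits results to this recogniser
results = custom_recogniser.analyze(text="Contact jane doe about project falcon.", entities=["CUSTOM"])
for r in results:
    print(r.entity_type, r.start, r.end, r.score)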
tools/redaction_review.py
CHANGED
@@ -49,18 +49,12 @@ def update_zoom(current_zoom_level:int, annotate_current_page:int, decrease:bool
49 | 
50 |      return current_zoom_level, annotate_current_page
51 | 
   | -def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=
52 | +def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
53 |      '''
54 |      Update a gradio_image_annotation object with new annotation data
55 |      '''
56 |      recogniser_entities = []
57 |      recogniser_dataframe = pd.DataFrame()
   | -    #recogniser_entities_drop = gr.Dropdown(value="ALL", allow_custom_value=True)
   | -    #recogniser_dataframe_gr = gr.Dataframe(pd.DataFrame(data={"page":[""], "label":[""]}))
   | -
   | -    #print("recogniser_dataframe_gr", recogniser_dataframe_gr)
   | -    #print("recogniser_dataframe_gr shape", recogniser_dataframe_gr.shape)
   | -    #print("recogniser_dataframe_gr.iloc[0,0]:", recogniser_dataframe_gr.iloc[0,0])
58 | 
59 |      if recogniser_dataframe_gr.iloc[0,0] == "":
60 |          try:

@@ -228,7 +222,7 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
222 | 
223 |      return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
224 | 
   | -def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
225 | +def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
226 |      '''
227 |      Apply modified redactions to a pymupdf and export review files
228 |      '''

@@ -251,75 +245,76 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
245 |          file_paths = [file_paths]
246 | 
247 |      for file_path in file_paths:
   | -        print("file_path:", file_path)
248 | +        #print("file_path:", file_path)
249 |          file_base = get_file_path_end(file_path)
250 | 
251 |          file_extension = os.path.splitext(file_path)[1].lower()
252 | 
253 | +        if save_pdf == True:
254 | +            # If working with image docs
255 | +            if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
256 | +                image = Image.open(file_paths[-1])
257 | 
258 | +                #image = pdf_doc
259 | 
260 | +                draw = ImageDraw.Draw(image)
261 | 
262 | +                for img_annotation_box in image_annotated['boxes']:
263 | +                    coords = [img_annotation_box["xmin"],
264 | +                              img_annotation_box["ymin"],
265 | +                              img_annotation_box["xmax"],
266 | +                              img_annotation_box["ymax"]]
267 | 
268 | +                    fill = img_annotation_box["color"]
269 | 
270 | +                    draw.rectangle(coords, fill=fill)
271 | 
272 | +                image.save(output_folder + file_base + "_redacted.png")
273 | 
274 | +                doc = [image]
275 | 
276 | +            elif file_extension in '.csv':
277 | +                print("This is a csv")
278 | +                pdf_doc = []
279 | 
280 | +            # If working with pdfs
281 | +            elif is_pdf(file_path) == True:
282 | +                pdf_doc = pymupdf.open(file_path)
283 | 
284 | +                number_of_pages = pdf_doc.page_count
285 | 
286 | +                print("Saving pages to file.")
287 | 
288 | +                for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
289 | 
290 | +                    #print("Saving page", str(i))
291 | +
292 | +                    image_loc = all_image_annotations[i]['image']
293 | +                    #print("Image location:", image_loc)
294 | 
295 | +                    # Load in image object
296 | +                    if isinstance(image_loc, np.ndarray):
297 | +                        image = Image.fromarray(image_loc.astype('uint8'))
298 | +                        #all_image_annotations[i]['image'] = image_loc.tolist()
299 | +                    elif isinstance(image_loc, Image.Image):
300 | +                        image = image_loc
301 | +                        #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
302 | +                        #image_loc.save(image_out_folder)
303 | +                        #all_image_annotations[i]['image'] = image_out_folder
304 | +                    elif isinstance(image_loc, str):
305 | +                        image = Image.open(image_loc)
306 | 
307 | +                    pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
308 | +                    pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
309 | 
310 | +            else:
311 | +                print("File type not recognised.")
312 | +
313 | +            #try:
314 | +            if pdf_doc:
315 | +                out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
316 | +                pdf_doc.save(out_pdf_file_path)
317 | +                output_files.append(out_pdf_file_path)
318 | 
319 |          try:
320 |              print("Saving annotations to JSON")

@@ -331,7 +326,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
326 | 
327 |          print("Saving annotations to CSV review file")
328 | 
   | -        print("review_file_state:", review_file_state)
329 | +        #print("review_file_state:", review_file_state)
330 | 
331 |          # Convert json to csv and also save this
332 |          review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
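The reworked apply_redactions gains a save_pdf flag so callers can regenerate only the JSON/CSV review outputs without re-writing the redacted document. When it does save, image inputs have the review boxes burned straight onto the bitmap with Pillow, while PDFs are reopened with pymupdf and each page is passed to redact_page_with_pymupdf. A minimal sketch of the image branch, assuming Pillow is installed; the function name, paths and box values are placeholders, with boxes using the annotator's xmin/ymin/xmax/ymax/color keys.

from PIL import Image, ImageDraw

def burn_boxes_onto_image(image_path, boxes, out_path):
    """Draw filled redaction rectangles over an image, mirroring the image branch above."""
    image = Image.open(image_path)
    draw = ImageDraw.Draw(image)
    for box in boxes:
        coords = [box["xmin"], box["ymin"], box["xmax"], box["ymax"]]
        draw.rectangle(coords, fill=box.get("color", (0, 0, 0)))  # default to black if no colour given
    image.save(out_path)

# Illustrative call; the paths and box values are placeholders
burn_boxes_onto_image(
    "page_1.png",
    [{"xmin": 40, "ymin": 100, "xmax": 220, "ymax": 130, "color": (0, 0, 0)}],
    "page_1_redacted.png",
)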