Merge pull request #11 from seanpedrick-case/dev
Files changed:

- .dockerignore +1 -1
- .gitignore +1 -0
- Dockerfile +1 -0
- README.md +32 -2
- app.py +41 -40
- how_to_create_exe_dist.txt +4 -2
- requirements.txt +17 -16
- tools/aws_functions.py +10 -1
- tools/custom_image_analyser_engine.py +1 -5
- tools/file_conversion.py +0 -4
- tools/file_redaction.py +64 -130
- tools/redaction_review.py +2 -2
.dockerignore CHANGED

```diff
@@ -16,5 +16,5 @@ build/*
 dist/*
 build_deps/*
 logs/*
-
+config/*
 user_guide/*
```
.gitignore CHANGED

```diff
@@ -16,5 +16,6 @@ build/*
 dist/*
 build_deps/*
 logs/*
+config/*
 doc_redaction_amplify_app/*
 user_guide/*
```
Dockerfile CHANGED

```diff
@@ -56,6 +56,7 @@ RUN mkdir -p /home/user/app/output \
 && mkdir -p /home/user/app/input \
 && mkdir -p /home/user/app/tld \
 && mkdir -p /home/user/app/logs \
+&& mkdir -p /home/user/app/config \
 && chown -R user:user /home/user/app

 # Copy installed packages from builder stage
```
README.md CHANGED

```diff
@@ -34,7 +34,16 @@ NOTE: The app is not 100% accurate, and it will miss some personal information.
 - [Handwriting and signature redaction](#handwriting-and-signature-redaction)
 - [Reviewing and modifying suggested redactions](#reviewing-and-modifying-suggested-redactions)
 
-See the [advanced user guide here](#advanced-user-guide)
+See the [advanced user guide here](#advanced-user-guide):
+- [Modifying and merging redaction review files](#modifying-and-merging-redaction-review-files)
+- [Modifying existing redaction review files](#modifying-existing-redaction-review-files)
+- [Merging existing redaction review files](#merging-existing-redaction-review-files)
+- [Identifying and redacting duplicate pages](#identifying-and-redacting-duplicate-pages)
+- [Fuzzy search and redaction](#fuzzy-search-and-redaction)
+- [Export redactions to and import from Adobe Acrobat](#export-to-and-import-from-adobe)
+- [Exporting to Adobe Acrobat](#exporting-to-adobe-acrobat)
+- [Importing from Adobe Acrobat](#importing-from-adobe-acrobat)
+- [Using AWS Textract and Comprehend when not running in an AWS environment](#using-aws-textract-and-comprehend-when-not-running-in-an-aws-environment)
 
 ## Example data files
 
@@ -292,4 +301,25 @@ The app also allows you to import .xfdf files from Adobe Acrobat. To do this, go
 
 When you click the 'convert .xfdf comment file to review_file.csv' button, the app should take you up to the top of the screen where the new review file has been created and can be downloaded.
 
-![Convert xfdf to csv](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/img/import_from_adobe.PNG)
+![Convert xfdf to csv](https://raw.githubusercontent.com/seanpedrick-case/document_redaction_examples/main/img/import_from_adobe.PNG)
+
+## Using AWS Textract and Comprehend when not running in an AWS environment
+
+AWS Textract and Comprehend give much better results for text extraction and document redaction than the local model options in the app. The most secure way to access them in the Redaction app is to run the app in a secure AWS environment with relevant permissions. Alternatively, you could run the app on your own system while logged in to AWS SSO with relevant permissions.
+
+However, it is possible to access these services directly via API from outside an AWS environment by creating IAM users and access keys with relevant permissions to access AWS Textract and Comprehend services. Please check with your IT and data security teams that this approach is acceptable for your data before trying the following approaches.
+
+To do the following, in your AWS environment you will need to create a new user with permissions for "textract:AnalyzeDocument", "textract:DetectDocumentText", and "comprehend:DetectPiiEntities". Under security credentials, create new access keys - note down the access key and secret key.
+
+### Direct access by passing AWS access keys through app
+The Redaction Settings tab now has boxes for entering the AWS access key and secret key. If you paste the relevant keys into these boxes before performing redaction, you should be able to use these services in the app.
+
+### Picking up AWS access keys through an .env file
+The app also has the capability of picking up AWS access key details through a .env file located at '/config/aws_config.env' (default), or an alternative .env file location specified by the environment variable AWS_CONFIG_PATH. The .env file should look like the following, with just two lines:
+
+AWS_ACCESS_KEY=<your-access-key>
+AWS_SECRET_KEY=<your-secret-key>
+
+The app should then pick up these keys when trying to access the AWS Textract and Comprehend services during redaction.
+
+Again, a lot can potentially go wrong with AWS solutions that are insecure, so before trying the above please consult with your AWS and data security teams.
```
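As a quick way to check that a new IAM user's keys and permissions work before pasting them into the app, a short boto3 sketch can exercise the three permissions listed above. This is not part of the app; the region and sample file name are placeholders:

```python
# Illustrative permissions check for the IAM user described in the README.
# Not part of the app; region and sample file are placeholders.
import boto3

access_key = "<your-access-key>"
secret_key = "<your-secret-key>"
region = "eu-west-2"

comprehend = boto3.client("comprehend", region_name=region,
                          aws_access_key_id=access_key, aws_secret_access_key=secret_key)
textract = boto3.client("textract", region_name=region,
                        aws_access_key_id=access_key, aws_secret_access_key=secret_key)

# comprehend:DetectPiiEntities
resp = comprehend.detect_pii_entities(Text="Jane Doe, 1 High Street", LanguageCode="en")
print([e["Type"] for e in resp["Entities"]])

# textract:DetectDocumentText and textract:AnalyzeDocument on a sample page image
with open("sample_page.png", "rb") as f:  # hypothetical file
    doc = {"Bytes": f.read()}
print(len(textract.detect_document_text(Document=doc)["Blocks"]))
print(len(textract.analyze_document(Document=doc, FeatureTypes=["SIGNATURES"])["Blocks"]))
```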
app.py CHANGED

```diff
@@ -2,7 +2,7 @@ import os
 import socket
 
 # By default TLDExtract will try to pull files from the internet. I have instead downloaded this file locally to avoid the requirement for an internet connection.
-os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
+#os.environ['TLDEXTRACT_CACHE'] = 'tld/.tld_set_snapshot'
 
 import gradio as gr
 import pandas as pd
@@ -65,7 +65,8 @@ with app:
 ###
 # STATE VARIABLES
 ###
-
+
+# Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
 pdf_doc_state = gr.State([])
 all_image_annotations_state = gr.State([])
 
@@ -73,12 +74,12 @@ with app:
 all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
 review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
-session_hash_state = gr.State()
-s3_output_folder_state = gr.State()
+session_hash_state = gr.Textbox(label= "session_hash_state", value="", visible=False) #.State()
+s3_output_folder_state = gr.Textbox(label= "s3_output_folder_state", value="", visible=False) #.State()
 
-first_loop_state = gr.State(True)
-second_loop_state = gr.State(False)
-do_not_save_pdf_state = gr.State(False)
+first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False) #.State(True)
+second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False) #.State(False)
+do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False) #.State(False)
 
 prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([])
 images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False) #gr.State([]) # List of pdf pages converted to PIL images
@@ -92,12 +93,12 @@ with app:
 # Logging state
 log_file_name = 'log.csv'
 
-feedback_logs_state = gr.State(feedback_logs_folder + log_file_name)
-feedback_s3_logs_loc_state = gr.State(feedback_logs_folder)
-access_logs_state = gr.State(access_logs_folder + log_file_name)
-access_s3_logs_loc_state = gr.State(access_logs_folder)
-usage_logs_state = gr.State(usage_logs_folder + log_file_name)
-usage_s3_logs_loc_state = gr.State(usage_logs_folder)
+feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=feedback_logs_folder + log_file_name, visible=False) #State(feedback_logs_folder + log_file_name)
+feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=feedback_logs_folder, visible=False) #State(feedback_logs_folder)
+access_logs_state = gr.Textbox(label= "access_logs_state", value=access_logs_folder + log_file_name, visible=False) #State(access_logs_folder + log_file_name)
+access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=access_logs_folder, visible=False) #State(access_logs_folder)
+usage_logs_state = gr.Textbox(label= "usage_logs_state", value=usage_logs_folder + log_file_name, visible=False) #State(usage_logs_folder + log_file_name)
+usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=usage_logs_folder, visible=False) #State(usage_logs_folder)
 
 # Invisible text boxes to hold the session hash/username, Textract request metadata, data file names just for logging purposes.
 session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -121,11 +122,11 @@ with app:
 
 ## Annotator zoom value
 annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
-zoom_true_bool = gr.State(True)
-zoom_false_bool = gr.State(False)
+zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False) #State(True)
+zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False) #State(False)
 
-clear_all_page_redactions = gr.State(True)
-prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
+clear_all_page_redactions = gr.Checkbox(label="clear_all_page_redactions", value=True, visible=False) #State(True)
+prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
 
 ## Settings page variables
 default_allow_list_file_name = "default_allow_list.csv"
@@ -148,11 +149,11 @@ with app:
 default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
 # Base dataframe for recognisers that is not modified subsequent to load
-recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
+recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False, label="recogniser_entity_dataframe_base")
 
 # Duplicate page detection
 in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
-duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="…
+duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
 
 
 
@@ -177,12 +178,12 @@ with app:
 with gr.Tab("Redact PDFs/images"):
 with gr.Accordion("Redact document", open = True):
 in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
-if RUN_AWS_FUNCTIONS == "1":
…
-else:
…
+# if RUN_AWS_FUNCTIONS == "1":
+in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
+pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
+# else:
+#     in_redaction_method = gr.Radio(label="Choose text extraction method.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option])
+#     pii_identification_method_drop = gr.Radio(label = "Choose PII detection method.", value = default_pii_detector, choices=[local_pii_detector], visible=False)
 
 gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the redaction settings tab.""")
 document_redact_btn = gr.Button("Redact document", variant="primary")
@@ -336,14 +337,14 @@ with app:
 page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
 page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
 
-with gr.Accordion("AWS Textract …
+with gr.Accordion("AWS Textract options", open = False):
 handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
 #with gr.Row():
 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
 
 with gr.Row():
-aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=…
-aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=…
+aws_access_key_textbox = gr.Textbox(value='', label="AWS access key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
+aws_secret_key_textbox = gr.Textbox(value='', label="AWS secret key for account with permissions for AWS Textract and Comprehend", visible=True, type="password")
 
 with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
 anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
@@ -355,8 +356,6 @@ with app:
 merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
 
 
-
-
 ### UI INTERACTION ###
 
 ###
@@ -365,14 +364,13 @@ with app:
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
 
 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
…
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
+then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox],
-outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
+current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state],
+outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # If a file has been completed, the function will continue onto the next document
@@ -386,7 +384,7 @@ with app:
 # Upload previous files for modifying redactions
 upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
 then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, …
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Page controls at top
@@ -445,12 +443,12 @@ with app:
 
 # Convert review file to xfdf Adobe format
 convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, …
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
 
 # Convert xfdf Adobe file back to review_file.csv
 convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, …
+then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
 
 ###
@@ -542,14 +540,17 @@ print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
 ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
 print(f'The value of ROOT_PATH is {ROOT_PATH}')
 
+DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
+print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
+
 if __name__ == "__main__":
 
 if RUN_DIRECT_MODE == "0":
 
 if os.environ['COGNITO_AUTH'] == "1":
-app.queue(max_size=MAX_QUEUE_SIZE).launch(show_error=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
+app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
 else:
-app.queue(max_size=MAX_QUEUE_SIZE).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
+app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
 
 else:
 from tools.cli_redact import main
```
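A recurring change in app.py is swapping gr.State holders for invisible standard components (Textbox/Checkbox) that carry the same values between events; one plausible motivation is that, unlike gr.State, invisible components have stable names and appear in the generated API. The sketch below is illustrative only (component names are hypothetical, not from this PR) and combines that pattern with the new configurable queue concurrency:

```python
# Minimal sketch of the two app.py patterns above, assuming Gradio 5.x.
# Names here are hypothetical; only the pattern mirrors the PR.
import os
import gradio as gr

# Env-var-with-default pattern, as used for DEFAULT_CONCURRENCY_LIMIT
concurrency = int(os.environ.get("DEFAULT_CONCURRENCY_LIMIT", "5"))

with gr.Blocks() as demo:
    # Invisible Checkbox standing in for gr.State(True): same boolean value,
    # but it is a named, serialisable component
    first_loop = gr.Checkbox(label="first_loop", value=True, visible=False)
    out = gr.Textbox(label="result")
    btn = gr.Button("Run")

    def run(is_first_loop: bool) -> str:
        return "first pass" if is_first_loop else "later pass"

    # The flag is passed like any other input component
    btn.click(run, inputs=[first_loop], outputs=[out])

demo.queue(max_size=5, default_concurrency_limit=concurrency).launch()
```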
how_to_create_exe_dist.txt CHANGED

```diff
@@ -1,3 +1,5 @@
+Here are instructions for creating an .exe runnable version of the redaction app. Tested until Gradio version 5.17.0
+
 1. Create minimal environment to run the app in conda. E.g. 'conda create --name new_env'
 
 2. Activate the environment 'conda activate new_env'
@@ -14,7 +16,7 @@ NOTE: for ensuring that spaCy models are loaded into the program correctly in re
 
 9. Run the following (This helped me: https://github.com/pyinstaller/pyinstaller/issues/8108):
 
-a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.…
+a) In command line: pyi-makespec --additional-hooks-dir="build_deps" --add-data "tesseract/:tesseract/" --add-data "poppler/poppler-24.02.0/:poppler/poppler-24.02.0/" --collect-data=gradio_client --collect-data=gradio --hidden-import=gradio_image_annotation --collect-data=gradio_image_annotation --collect-all=gradio_image_annotation --hidden-import pyarrow.vendored.version --hidden-import pydicom.encoders --hidden-import=safehttpx --collect-all=safehttpx --hidden-import=presidio_analyzer --collect-all=presidio_analyzer --hidden-import=presidio_anonymizer --collect-all=presidio_anonymizer --hidden-import=presidio_image_redactor --collect-all=presidio_image_redactor --name DocRedactApp_0.3.0 app.py
 
 # Add --onefile to the above if you would like everything packaged as a single exe, although this will need to be extracted upon starting the app, slowing down initialisation time significantly.
 
@@ -30,7 +32,7 @@ a = Analysis(
 
 hook-presidio-image-redactor.py
 
-c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.…
+c) Back in command line, run this: pyinstaller --clean --noconfirm DocRedactApp_0.3.0.spec
 
 
 9. A 'dist' folder will be created with the executable inside along with all dependencies('dist\redaction').
```
requirements.txt CHANGED

```diff
@@ -1,30 +1,31 @@
-pdfminer.six==…
+pdfminer.six==20240706
 pdf2image==1.17.0
-pymupdf==1.…
+pymupdf==1.25.3
 opencv-python==4.10.0.84
-presidio_analyzer==2.2.…
-presidio_anonymizer==2.2.…
-presidio-image-redactor==0.0.…
-pikepdf==…
+presidio_analyzer==2.2.357
+presidio_anonymizer==2.2.357
+presidio-image-redactor==0.0.55
+pikepdf==9.5.2
 pandas==2.2.3
 nltk==3.9.1
-scikit-learn==1.…
+scikit-learn==1.6.1
 spacy==3.8.3
 #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
 en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.…
+gradio==5.18.0
-boto3==1.36.…
+boto3==1.36.26
-pyarrow==…
+pyarrow==19.0.1
-openpyxl==3.1.…
+openpyxl==3.1.5
-Faker==…
+Faker==36.1.1
 python-levenshtein==0.26.1
 spaczz==0.6.1
-gradio_image_annotation==0.2.5
-# The following version includes rotation and image zoom options
-
+#gradio_image_annotation==0.2.5
+# The following version includes rotation and image zoom options
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.0/gradio_image_annotation-0.3.0-py3-none-any.whl
 rapidfuzz==3.12.1
+python-dotenv==1.0.1
 numpy==1.26.4
-awslambdaric==3.0.…
+awslambdaric==3.0.1
```
tools/aws_functions.py CHANGED

```diff
@@ -4,18 +4,27 @@ import boto3
 import tempfile
 import os
 from tools.helper_functions import get_or_create_env_var
+from dotenv import load_dotenv
 
 PandasDataFrame = Type[pd.DataFrame]
 
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "…
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
 print(f'The value of AWS_REGION is {AWS_REGION}')
 
+# If you have an aws_config env file in the config folder, you can load in AWS keys this way
+AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '/env/aws_config.env')
+print(f'The value of AWS_CONFIG_PATH is {AWS_CONFIG_PATH}')
+
+if os.path.exists(AWS_CONFIG_PATH):
+    print("Loading AWS keys from config folder")
+    load_dotenv(AWS_CONFIG_PATH)
+
 AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
 if AWS_ACCESS_KEY:
     print(f'AWS_ACCESS_KEY found in environment variables')
```
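The effect of the block above is that load_dotenv copies the key=value pairs from aws_config.env into os.environ, where the existing AWS_ACCESS_KEY/AWS_SECRET_KEY lookups then find them. A minimal sketch of that flow, assuming python-dotenv is installed (the path below is illustrative; the app reads it from AWS_CONFIG_PATH):

```python
# Minimal sketch of the .env pickup above: load_dotenv populates os.environ
# from the file, so later environment lookups succeed.
import os
from dotenv import load_dotenv

load_dotenv("config/aws_config.env")  # illustrative path

print(bool(os.environ.get("AWS_ACCESS_KEY")))  # True once the file is loaded
print(bool(os.environ.get("AWS_SECRET_KEY")))
```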
tools/custom_image_analyser_engine.py CHANGED

```diff
@@ -515,6 +515,7 @@ def do_aws_comprehend_call(current_batch, current_batch_mapping, comprehend_clie
 
     except Exception as e:
         if attempt == max_retries - 1:
+            print("AWS Comprehend calls failed due to", e)
             raise
         time.sleep(retry_delay)
 
@@ -571,7 +572,6 @@ def run_page_text_redaction(
 allow_list=allow_list
 )
 
-#print("page_analyser_result:", page_analyser_result)
 
 all_text_line_results = map_back_entity_results(
 page_analyser_result,
@@ -579,10 +579,8 @@ def run_page_text_redaction(
 all_text_line_results
 )
 
-#print("all_text_line_results:", all_text_line_results)
 
 elif pii_identification_method == "AWS Comprehend":
-#print("page text:", page_text)
 
 # Process custom entities if any
 if custom_entities:
@@ -600,8 +598,6 @@ def run_page_text_redaction(
 allow_list=allow_list
 )
 
-print("page_analyser_result:", page_analyser_result)
-
 all_text_line_results = map_back_entity_results(
 page_analyser_result,
 page_text_mapping,
```
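The first hunk above logs the final error before re-raising once the retry budget is exhausted. The surrounding retry loop is not shown in the diff; a plausible reconstruction of the pattern (variable names assumed, not copied from the repo) looks like this:

```python
# Illustrative retry-with-delay pattern around an AWS Comprehend call.
# max_retries/retry_delay and the call itself are assumptions for this sketch.
import time

def call_with_retries(comprehend_client, text: str, max_retries: int = 3, retry_delay: int = 3):
    for attempt in range(max_retries):
        try:
            return comprehend_client.detect_pii_entities(Text=text, LanguageCode="en")
        except Exception as e:
            if attempt == max_retries - 1:
                # Log the final failure before giving up, as in the PR
                print("AWS Comprehend calls failed due to", e)
                raise
            time.sleep(retry_delay)
```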
tools/file_conversion.py CHANGED

```diff
@@ -464,12 +464,10 @@ def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colou
 def prepare_image_or_pdf(
     file_paths: List[str],
     in_redact_method: str,
-    in_allow_list: Optional[List[List[str]]] = None,
     latest_file_completed: int = 0,
     out_message: List[str] = [],
     first_loop_state: bool = False,
     number_of_pages:int = 1,
-    current_loop_page_number:int=0,
     all_annotations_object:List = [],
     prepare_for_review:bool = False,
     in_fully_redacted_list:List[int]=[],
@@ -484,12 +482,10 @@ def prepare_image_or_pdf(
     Args:
         file_paths (List[str]): List of file paths to process.
         in_redact_method (str): The redaction method to use.
-        in_allow_list (optional, Optional[List[List[str]]]): List of allowed terms for redaction.
         latest_file_completed (optional, int): Index of the last completed file.
         out_message (optional, List[str]): List to store output messages.
         first_loop_state (optional, bool): Flag indicating if this is the first iteration.
         number_of_pages (optional, int): integer indicating the number of pages in the document
-        current_loop_page_number (optional, int): Current number of loop
         all_annotations_object(optional, List of annotation objects): All annotations for current document
         prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
         in_fully_redacted_list(optional, List of int): A list of pages to fully redact
```
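With in_allow_list and current_loop_page_number removed from the signature, callers pass the remaining state straight through, as the reworked app.py wiring above reflects. A hedged call sketch against the trimmed signature, using keyword arguments and placeholder values (the file path is hypothetical, and further parameters after in_fully_redacted_list are omitted):

```python
# Illustrative call against the trimmed signature; values are placeholders.
from tools.file_conversion import prepare_image_or_pdf
from tools.helper_functions import text_ocr_option

out = prepare_image_or_pdf(
    file_paths=["example.pdf"],   # hypothetical input file
    in_redact_method=text_ocr_option,
    latest_file_completed=0,
    out_message=[],
    first_loop_state=True,
    number_of_pages=1,
    all_annotations_object=[],
    prepare_for_review=False,
    in_fully_redacted_list=[],
)
```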
tools/file_redaction.py
CHANGED
@@ -29,7 +29,7 @@ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRRes
|
|
29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
31 |
from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
-
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
33 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
34 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
35 |
|
@@ -78,9 +78,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
78 |
custom_recogniser_word_list:List[str]=None,
|
79 |
redact_whole_page_list:List[str]=None,
|
80 |
latest_file_completed:int=0,
|
81 |
-
out_message:
|
82 |
-
out_file_paths:
|
83 |
-
log_files_output_paths:
|
84 |
first_loop_state:bool=False,
|
85 |
page_min:int=0,
|
86 |
page_max:int=999,
|
@@ -99,6 +99,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
99 |
match_fuzzy_whole_phrase_bool:bool=True,
|
100 |
aws_access_key_textbox:str='',
|
101 |
aws_secret_key_textbox:str='',
|
|
|
|
|
102 |
output_folder:str=output_folder,
|
103 |
progress=gr.Progress(track_tqdm=True)):
|
104 |
'''
|
@@ -136,6 +138,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
136 |
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
137 |
- aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
|
138 |
- aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
|
|
|
139 |
- output_folder (str, optional): Output folder for results.
|
140 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
141 |
|
@@ -145,6 +148,13 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
145 |
tic = time.perf_counter()
|
146 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
148 |
#print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
149 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
150 |
|
@@ -212,7 +222,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
212 |
estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
|
213 |
print("Estimated total processing time:", str(estimate_total_processing_time))
|
214 |
|
215 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
216 |
|
217 |
# If we have reached the last page, return message
|
218 |
if current_loop_page >= number_of_pages:
|
@@ -228,7 +238,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
228 |
|
229 |
review_out_file_paths.extend(out_review_file_path)
|
230 |
|
231 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
232 |
|
233 |
# Create allow list
|
234 |
# If string, assume file path
|
@@ -241,45 +251,52 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
241 |
else:
|
242 |
in_allow_list_flat = []
|
243 |
|
244 |
-
|
245 |
-
# Try to connect to AWS services only if RUN_AWS_FUNCTIONS environmental variable is 1
|
246 |
if pii_identification_method == "AWS Comprehend":
|
247 |
print("Trying to connect to AWS Comprehend service")
|
248 |
-
if
|
249 |
-
|
250 |
-
|
|
|
251 |
comprehend_client = boto3.client('comprehend',
|
252 |
aws_access_key_id=aws_access_key_textbox,
|
253 |
aws_secret_access_key=aws_secret_key_textbox)
|
|
|
|
|
|
|
254 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
|
|
255 |
comprehend_client = boto3.client('comprehend',
|
256 |
aws_access_key_id=AWS_ACCESS_KEY,
|
257 |
-
aws_secret_access_key=AWS_SECRET_KEY)
|
258 |
else:
|
259 |
comprehend_client = ""
|
260 |
-
out_message = "Cannot connect to AWS Comprehend service. Please choose another PII identification method."
|
261 |
print(out_message)
|
262 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
263 |
else:
|
264 |
comprehend_client = ""
|
265 |
|
266 |
if in_redact_method == textract_option:
|
267 |
-
print("Trying to connect to AWS Textract service")
|
268 |
-
if
|
269 |
-
|
270 |
-
|
271 |
-
comprehend_client = boto3.client('textract',
|
272 |
aws_access_key_id=aws_access_key_textbox,
|
273 |
aws_secret_access_key=aws_secret_key_textbox)
|
|
|
|
|
|
|
274 |
elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
|
275 |
-
|
|
|
276 |
aws_access_key_id=AWS_ACCESS_KEY,
|
277 |
-
aws_secret_access_key=AWS_SECRET_KEY)
|
278 |
else:
|
279 |
textract_client = ""
|
280 |
-
out_message = "Cannot connect to AWS Textract. Please choose another text extraction method."
|
281 |
print(out_message)
|
282 |
-
return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
|
283 |
else:
|
284 |
textract_client = ""
|
285 |
|
@@ -301,9 +318,6 @@ def choose_and_run_redactor(file_paths:List[str],
 301     file_paths_list = file_paths
 302     file_paths_loop = [file_paths_list[int(latest_file_completed)]]
 303
-304     # print("file_paths_list in choose_redactor function:", file_paths_list)
-305
-306
 307     for file in file_paths_loop:
 308         if isinstance(file, str):
 309             file_path = file

@@ -313,7 +327,6 @@ def choose_and_run_redactor(file_paths:List[str],
 313         if file_path:
 314             pdf_file_name_without_ext = get_file_name_without_type(file_path)
 315             pdf_file_name_with_ext = os.path.basename(file_path)
-316             # print("Redacting file:", pdf_file_name_with_ext)
 317
 318         is_a_pdf = is_pdf(file_path) == True
 319         if is_a_pdf == False and in_redact_method == text_ocr_option:

@@ -324,14 +337,14 @@ def choose_and_run_redactor(file_paths:List[str],
 324             out_message = "No file selected"
 325             print(out_message)
 326
-327             return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 328
 329         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
 330
 331             #Analyse and redact image-based pdf or image
 332             if is_pdf_or_image(file_path) == False:
 333                 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
-334                 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 335
 336             print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
 337

@@ -361,14 +374,11 @@ def choose_and_run_redactor(file_paths:List[str],
 361                 custom_recogniser_word_list,
 362                 redact_whole_page_list,
 363                 max_fuzzy_spelling_mistakes_num,
-364                 match_fuzzy_whole_phrase_bool
-365
-366
-367             #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
-368
 369             # Save Textract request metadata (if exists)
 370             if new_request_metadata:
-371                 #print("Request metadata:", new_request_metadata)
 372                 all_request_metadata.append(new_request_metadata)
 373
 374         elif in_redact_method == text_ocr_option:

@@ -377,7 +387,7 @@ def choose_and_run_redactor(file_paths:List[str],
 377
 378             if is_pdf(file_path) == False:
 379                 out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
-380                 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 381
 382             # Analyse text-based pdf
 383             print('Redacting file as text-based PDF')

@@ -407,7 +417,7 @@ def choose_and_run_redactor(file_paths:List[str],
 407         else:
 408             out_message = "No redaction method selected"
 409             print(out_message)
-410             return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 411
 412         # If at last page, save to file
 413         if current_loop_page >= number_of_pages:
@@ -422,9 +432,6 @@ def choose_and_run_redactor(file_paths:List[str],
 422             # Save file
 423             if is_pdf(file_path) == False:
 424                 out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
-425                 #pymupdf_doc[0].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)
-426                 #print("pymupdf_doc", pymupdf_doc)
-427                 #print("pymupdf_doc[0]", pymupdf_doc[0])
 428                 pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
 429                 out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
 430

@@ -434,10 +441,6 @@ def choose_and_run_redactor(file_paths:List[str],
 434
 435             out_file_paths.append(out_redacted_pdf_file_path)
 436
-437             #if log_files_output_paths:
-438             #    log_files_output_paths.extend(log_files_output_paths)
-439
-440
 441             out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
 442
 443             logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"

@@ -450,27 +453,20 @@ def choose_and_run_redactor(file_paths:List[str],
 450
 451             # Save the gradio_annotation_boxes to a JSON file
 452             try:
-453
-454                 #print("Saving annotations to CSV")
-455
-456                 # Convert json to csv and also save this
-457                 #print("annotations_all_pages:", annotations_all_pages)
-458                 #print("all_decision_process_table:", all_decision_process_table)
-459
 460                 review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 461
 462                 out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
 463                 review_df.to_csv(out_review_file_path, index=None)
 464                 out_file_paths.append(out_review_file_path)
 465
-466                 print("Saved review file to csv")
 467
 468                 out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
 469                 with open(out_annotation_file_path, 'w') as f:
 470                     json.dump(annotations_all_pages, f)
 471                 log_files_output_paths.append(out_annotation_file_path)
 472
-473                 print("Saving annotations to JSON")
 474
 475             except Exception as e:
 476                 print("Could not save annotations to json or csv file:", e)
@@ -488,7 +484,6 @@ def choose_and_run_redactor(file_paths:List[str],
 488             combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
 489
 490             estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
-491             #print("Estimated total processing time:", str(estimate_total_processing_time))
 492
 493         else:
 494             toc = time.perf_counter()

@@ -511,19 +506,12 @@ def choose_and_run_redactor(file_paths:List[str],
 511
 512     if combined_out_message: out_message = combined_out_message
 513
-514     #print("\nout_message at choose_and_run_redactor end is:", out_message)
-515
 516     # Ensure no duplicated output files
 517     log_files_output_paths = list(set(log_files_output_paths))
 518     out_file_paths = list(set(out_file_paths))
 519     review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 520
-521
-522     #print("out_file_paths:", out_file_paths)
-523     #print("review_out_file_paths:", review_out_file_paths)
-524
-525
-526     return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths
 527
 528 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
 529     '''

@@ -646,9 +634,6 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="imag
 646     # Unpack coordinates
 647     x1, y1, x2, y2 = rect_coordinates
 648
-649     #print("scale_width:", scale_width)
-650     #print("scale_height:", scale_height)
-651
 652     x1 = (x1* scale_width)# + page_x_adjust
 653     new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
 654     x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
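The hunk above only strips debug prints from the image-to-PyMuPDF coordinate conversion, but for readers unfamiliar with the scaling involved, a minimal sketch of the idea follows (illustrative names, not the repo's exact function):

import fitz  # PyMuPDF
from PIL import Image

def scale_image_box_to_page(pymupdf_page: fitz.Page, image: Image.Image,
                            x1: float, y1: float, x2: float, y2: float) -> fitz.Rect:
    # Ratio between the PDF page size (points) and the rendered image size (pixels).
    scale_width = pymupdf_page.rect.width / image.width
    scale_height = pymupdf_page.rect.height / image.height
    # Both coordinate systems here have a top-left origin, so a pure scale
    # is enough; no y-axis flip is needed for PyMuPDF rectangles.
    return fitz.Rect(x1 * scale_width, y1 * scale_height,
                     x2 * scale_width, y2 * scale_height)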
@@ -1005,12 +990,10 @@ def redact_image_pdf(file_path:str,
 1005     if custom_recogniser_word_list:
 1006         nlp_analyser.registry.remove_recognizer("CUSTOM")
 1007         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
-1008         #print("new_custom_recogniser:", new_custom_recogniser)
 1009         nlp_analyser.registry.add_recognizer(new_custom_recogniser)
 1010
 1011         nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
 1012         new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
-1013         #print("new_custom_recogniser:", new_custom_recogniser)
 1014         nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 1015
 1016
@@ -1045,22 +1028,15 @@ def redact_image_pdf(file_path:str,
 1045     else: page_min = page_min - 1
 1046
 1047     print("Page range:", str(page_min + 1), "to", str(page_max))
-1048     #print("Current_loop_page:", current_loop_page)
 1049
 1050     # If running Textract, check if file already exists. If it does, load in existing data
-1051     # Import results from json and convert
 1052     if analysis_type == textract_option:
 1053
 1054         json_file_path = output_folder + file_name + "_textract.json"
 1055
-1056
 1057         if not os.path.exists(json_file_path):
 1058             print("No existing Textract results file found.")
 1059             textract_data = {}
-1060             #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
-1061             #log_files_output_paths.append(json_file_path)
-1062             #request_metadata = request_metadata + "\n" + new_request_metadata
-1063             #wrapped_text_blocks = {"pages":[text_blocks]}
 1064         else:
 1065             # Open the file and load the JSON data
 1066             no_textract_file = False

@@ -1073,7 +1049,6 @@ def redact_image_pdf(file_path:str,
 1073                 textract_data = json.load(json_file)
 1074
 1075     ###
-1076
 1077     if current_loop_page == 0: page_loop_start = 0
 1078     else: page_loop_start = current_loop_page
 1079
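This hunk tidies the Textract caching logic: reuse a saved _textract.json when one exists, otherwise call the API once and persist the response. A sketch of the pattern, with fetch_page standing in for the repo's analyse_page_with_textract call:

import json, os

def load_or_fetch_textract(json_file_path: str, fetch_page):
    if os.path.exists(json_file_path):
        with open(json_file_path) as json_file:
            return json.load(json_file)           # cached response, no API cost
    textract_data = {"pages": [fetch_page()]}     # single paid API call
    with open(json_file_path, "w") as f:
        json.dump(textract_data, f)               # persist for later runs
    return textract_data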
@@ -1087,7 +1062,6 @@ def redact_image_pdf(file_path:str,
 1087         page_break_return = False
 1088
 1089         reported_page_number = str(page_no + 1)
-1090         #print("Redacting page:", reported_page_number)
 1091
 1092         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
 1093         try:

@@ -1104,7 +1078,6 @@ def redact_image_pdf(file_path:str,
 1104
 1105         #print("Image is in range of pages to redact")
 1106         if isinstance(image, str):
-1107             #print("image is a file path", image)
 1108             image = Image.open(image)
 1109
 1110         # Need image size to convert textract OCR outputs to the correct sizes

@@ -1153,7 +1126,7 @@ def redact_image_pdf(file_path:str,
 1153                     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
 1154                 except Exception as e:
 1155                     print("Textract extraction for page", reported_page_number, "failed due to:", e)
-1156
 1157                     new_request_metadata = "Failed Textract API call"
 1158
 1159                 # Check if "pages" key exists, if not, initialize it as an empty list

@@ -1192,13 +1165,13 @@ def redact_image_pdf(file_path:str,
 1192             redaction_bboxes = []
 1193
 1194
-1195             if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
-1196             elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 1197
-1198             # Save decision making process
-1199             bboxes_str = str(redaction_bboxes)
-1200             with open(interim_results_file_path, "w") as f:
-1201                 f.write(bboxes_str)
 1202
 1203             # Merge close bounding boxes
 1204             merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
@@ -1210,7 +1183,6 @@ def redact_image_pdf(file_path:str,
 1210             all_image_annotations_boxes = []
 1211
 1212             for box in merged_redaction_bboxes:
-1213                 #print("box:", box)
 1214
 1215                 x0 = box.left
 1216                 y0 = box.top

@@ -1238,8 +1210,6 @@ def redact_image_pdf(file_path:str,
 1238
 1239         ## Apply annotations with pymupdf
 1240         else:
-1241             #print("merged_redaction_boxes:", merged_redaction_bboxes)
-1242             #print("redact_whole_page_list:", redact_whole_page_list)
 1243             if redact_whole_page_list:
 1244                 int_reported_page_number = int(reported_page_number)
 1245                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

@@ -1284,8 +1254,6 @@ def redact_image_pdf(file_path:str,
 1284
 1285         time_taken = toc - tic
 1286
-1287         #print("toc - tic:", time_taken)
-1288
 1289         # Break if time taken is greater than max_time seconds
 1290         if time_taken > max_time:
 1291             print("Processing for", max_time, "seconds, breaking loop.")
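The break above enforces a per-call time budget so long documents can be processed across several invocations from the UI. A sketch of the idea, assuming a caller-supplied process_page helper:

import time

def redact_pages(pages, process_page, max_time: float = 30.0, current_loop_page: int = 0):
    tic = time.perf_counter()
    page_break_return = False
    for page_no in range(current_loop_page, len(pages)):
        process_page(pages[page_no])          # per-page work, supplied by the caller
        current_loop_page = page_no + 1
        if time.perf_counter() - tic > max_time:
            page_break_return = True          # signal the UI to call again from here
            break
    return current_loop_page, page_break_return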
@@ -1298,7 +1266,6 @@ def redact_image_pdf(file_path:str,
 1298             pymupdf_doc = images
 1299
 1300             # Check if the image already exists in annotations_all_pages
-1301             #print("annotations_all_pages:", annotations_all_pages)
 1302             existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1303             if existing_index is not None:
 1304                 # Replace the existing annotation

@@ -1315,6 +1282,8 @@ def redact_image_pdf(file_path:str,
 1315                 if json_file_path not in log_files_output_paths:
 1316                     log_files_output_paths.append(json_file_path)
 1317
 1318             current_loop_page += 1
 1319
 1320             return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

@@ -1324,7 +1293,6 @@ def redact_image_pdf(file_path:str,
 1324         pymupdf_doc = images
 1325
 1326         # Check if the image already exists in annotations_all_pages
-1327         #print("annotations_all_pages:", annotations_all_pages)
 1328         existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1329         if existing_index is not None:
 1330             # Replace the existing annotation
@@ -1409,9 +1377,6 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
 1409
 1410         if isinstance(char, LTAnno):
 1411
-1412             # print("Character line:", "".join(character_text_objects_out))
-1413             # print("Char is an annotation object:", char)
-1414
 1415             added_text = char.get_text()
 1416
 1417             # Handle double quotes

@@ -1427,7 +1392,7 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
 1427
 1428             # Check for line break (assuming a new line is indicated by a specific character)
 1429             if '\n' in added_text:
-1430
 1431                 # Finalize the current line
 1432                 if current_word:
 1433                     word_bboxes.append((current_word, current_word_bbox))

@@ -1475,13 +1440,12 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
 1475         word_bboxes.append((current_word, current_word_bbox))
 1476
 1477     if full_text:
-1478         #print("full_text before:", full_text)
 1479         if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
 1480             # Convert special characters to a human-readable format
-1481
 1482             full_text = clean_unicode_text(full_text)
 1483         full_text = full_text.strip()
-1484
 1485
 1486     line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
 1487
@@ -1498,9 +1462,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
 1498     analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
 1499
 1500     # Remove brackets and split the string into four separate columns
-1501     #print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
-1502     # analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
-1503
 1504     # Split the boundingBox list into four separate columns
 1505     analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
 1506

@@ -1512,8 +1473,6 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
 1512     analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
 1513     analysed_bounding_boxes_df_new['page'] = page_num + 1
 1514     decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
-1515
-1516     #print('\n\ndecision_process_table:\n\n', decision_process_table)
 1517
 1518     return decision_process_table
 1519
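The change above keeps the list-based boundingBox expansion and drops the older string-splitting attempt. For reference, the kept idiom works like this: when each row holds a 4-element list, .apply(pd.Series) fans it out into four columns.

import pandas as pd

df = pd.DataFrame({"boundingBox": [[10.0, 20.0, 110.0, 40.0],
                                   [15.0, 60.0, 90.0, 80.0]]})
# Each list becomes a row of a temporary DataFrame whose columns we rename.
df[["xmin", "ymin", "xmax", "ymax"]] = df["boundingBox"].apply(pd.Series)
print(df[["xmin", "ymin", "xmax", "ymax"]])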
@@ -1607,7 +1566,6 @@ def redact_text_pdf(
 1607         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 1608
 1609     # Update custom word list analyser object with any new words that have been added to the custom deny list
-1610     #print("custom_recogniser_word_list:", custom_recogniser_word_list)
 1611     if custom_recogniser_word_list:
 1612         nlp_analyser.registry.remove_recognizer("CUSTOM")
 1613         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)

@@ -1617,16 +1575,6 @@ def redact_text_pdf(
 1617         new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
 1618         nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 1619
-1620     # List all elements currently in the nlp_analyser registry
-1621     #print("Current recognizers in nlp_analyser registry:")
-1622     #for recognizer_name in nlp_analyser.registry.recognizers:
-1623     #    print(recognizer_name)
-1624     #    print(recognizer_name.name)
-1625
-1626     #print("Custom recogniser:", nlp_analyser.registry)
-1627
-1628     #print("custom_recogniser_word_list:", custom_recogniser_word_list)
-1629
 1630     tic = time.perf_counter()
 1631
 1632     # Open with Pikepdf to get text lines

@@ -1641,7 +1589,6 @@ def redact_text_pdf(
 1641     else: page_min = page_min - 1
 1642
 1643     print("Page range is",str(page_min + 1), "to", str(page_max))
-1644     print("Current_loop_page:", current_loop_page)
 1645
 1646     if current_loop_page == 0: page_loop_start = 0
 1647     else: page_loop_start = current_loop_page
@@ -1716,8 +1663,6 @@ def redact_text_pdf(
 1716             ### REDACTION
 1717
 1718             if chosen_redact_entities or chosen_redact_comprehend_entities:
-1719                 #print("Identifying redactions on page.")
-1720
 1721                 page_analysed_bounding_boxes = run_page_text_redaction(
 1722                     language,
 1723                     chosen_redact_entities,

@@ -1735,24 +1680,18 @@ def redact_text_pdf(
 1735                     comprehend_query_number
 1736                 )
 1737
-1738
-1739                 #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
-1740                 #print("image:", image)
 1741             else:
 1742                 page_analysed_bounding_boxes = []
 1743
 1744
 1745             page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
 1746
-1747             #print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)
 1748
 1749             # Annotate redactions on page
 1750             pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 1751
-1752             # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
-1753
 1754             # Make pymupdf page redactions
-1755             #print("redact_whole_page_list:", redact_whole_page_list)
 1756             if redact_whole_page_list:
 1757                 int_reported_page_number = int(reported_page_number)
 1758                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

@@ -1761,9 +1700,6 @@ def redact_text_pdf(
 1761
 1762             pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
 1763
-1764             #print("image_annotations:", image_annotations)
-1765
-1766             #print("Did redact_page_with_pymupdf function")
 1767             reported_page_no = page_no + 1
 1768             print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
 1769

@@ -1778,14 +1714,12 @@ def redact_text_pdf(
 1778
 1779             if not decision_process_table_on_page.empty:
 1780                 all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
-1781
 1782
 1783             toc = time.perf_counter()
 1784
 1785             time_taken = toc - tic
 1786
-1787             #print("toc - tic:", time_taken)
-1788
 1789             # Break if time taken is greater than max_time seconds
 1790             if time_taken > max_time:
 1791                 print("Processing for", max_time, "seconds, breaking.")
 29   from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
 30   from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
 31   from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+32   from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
 33   from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 34   from tools.presidio_analyzer_custom import recognizer_result_from_dict
 35

 78       custom_recogniser_word_list:List[str]=None,
 79       redact_whole_page_list:List[str]=None,
 80       latest_file_completed:int=0,
+81       out_message:List=[],
+82       out_file_paths:List=[],
+83       log_files_output_paths:List=[],
 84       first_loop_state:bool=False,
 85       page_min:int=0,
 86       page_max:int=999,

 99       match_fuzzy_whole_phrase_bool:bool=True,
 100      aws_access_key_textbox:str='',
 101      aws_secret_key_textbox:str='',
+102      annotate_max_pages:int=1,
+103      review_file_state=[],
 104      output_folder:str=output_folder,
 105      progress=gr.Progress(track_tqdm=True)):
 106      '''
 138      - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
 139      - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
 140      - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+141      - annotate_max_pages (int, optional): Maximum page value for the annotation object
 142      - output_folder (str, optional): Output folder for results.
 143      - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 144

 148      tic = time.perf_counter()
 149      all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
 150
+151      # If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
+152      if not pymupdf_doc:
+153          print("Prepared PDF file not found, running prepare_image_or_pdf function")
+154          out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages)
+155
+156          annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
+157
 158      #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
 159      review_out_file_paths = [prepared_pdf_file_paths[0]]
 160
 222      estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
 223      print("Estimated total processing time:", str(estimate_total_processing_time))
 224
+225      return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 226
 227      # If we have reached the last page, return message
 228      if current_loop_page >= number_of_pages:

 238
 239          review_out_file_paths.extend(out_review_file_path)
 240
+241          return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 242
 243      # Create allow list
 244      # If string, assume file path

 251      else:
 252          in_allow_list_flat = []
 253
+254      # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
 255      if pii_identification_method == "AWS Comprehend":
 256          print("Trying to connect to AWS Comprehend service")
+257          if aws_access_key_textbox and aws_secret_key_textbox:
+258              print("Connecting to Comprehend using AWS access key and secret keys from textboxes.")
+259              print("aws_access_key_textbox:", aws_access_key_textbox)
+260              print("aws_secret_access_key:", aws_secret_key_textbox)
 261              comprehend_client = boto3.client('comprehend',
 262                  aws_access_key_id=aws_access_key_textbox,
 263                  aws_secret_access_key=aws_secret_key_textbox)
+264          elif RUN_AWS_FUNCTIONS == "1":
+265              print("Connecting to Comprehend via existing SSO connection")
+266              comprehend_client = boto3.client('comprehend')
 267          elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+268              print("Getting Comprehend credentials from environment variables")
 269              comprehend_client = boto3.client('comprehend',
 270                  aws_access_key_id=AWS_ACCESS_KEY,
+271                  aws_secret_access_key=AWS_SECRET_KEY)
 272          else:
 273              comprehend_client = ""
+274              out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
 275              print(out_message)
+276              return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 277      else:
 278          comprehend_client = ""
 279
 280      if in_redact_method == textract_option:
+281          print("Trying to connect to AWS Textract service")
+282          if aws_access_key_textbox and aws_secret_key_textbox:
+283              print("Connecting to Textract using AWS access key and secret keys from textboxes.")
+284              textract_client = boto3.client('textract',
 285                  aws_access_key_id=aws_access_key_textbox,
 286                  aws_secret_access_key=aws_secret_key_textbox)
+287          elif RUN_AWS_FUNCTIONS == "1":
+288              print("Connecting to Textract via existing SSO connection")
+289              textract_client = boto3.client('textract')
 290          elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
+291              print("Getting Textract credentials from environment variables.")
+292              textract_client = boto3.client('textract',
 293                  aws_access_key_id=AWS_ACCESS_KEY,
+294                  aws_secret_access_key=AWS_SECRET_KEY)
 295      else:
 296          textract_client = ""
+297          out_message = "Cannot connect to AWS Textract. Please provide access keys under Textract settings on the Redaction settings tab,choose another text extraction method."
 298          print(out_message)
+299          return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 300      else:
 301          textract_client = ""
 302
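The new connection code tries credentials in a fixed order: keys typed into the UI first, then an ambient (SSO or role-based) session when RUN_AWS_FUNCTIONS == "1", then environment-variable keys. (Echoing the secret key via print, as lines 259-260 do, is generally best avoided in logs.) A hedged sketch of the fallback using only standard boto3 calls; the helper name is illustrative:

import boto3

def make_client(service: str, textbox_key: str = "", textbox_secret: str = "",
                run_aws_functions: str = "0", env_key: str = "", env_secret: str = ""):
    if textbox_key and textbox_secret:
        return boto3.client(service, aws_access_key_id=textbox_key,
                            aws_secret_access_key=textbox_secret)
    if run_aws_functions == "1":
        return boto3.client(service)   # default credential chain: SSO profile, role, etc.
    if env_key and env_secret:
        return boto3.client(service, aws_access_key_id=env_key,
                            aws_secret_access_key=env_secret)
    raise RuntimeError(f"No credentials available for {service}")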
 318      file_paths_list = file_paths
 319      file_paths_loop = [file_paths_list[int(latest_file_completed)]]
 320
 321      for file in file_paths_loop:
 322          if isinstance(file, str):
 323              file_path = file

 327          if file_path:
 328              pdf_file_name_without_ext = get_file_name_without_type(file_path)
 329              pdf_file_name_with_ext = os.path.basename(file_path)
 330
 331          is_a_pdf = is_pdf(file_path) == True
 332          if is_a_pdf == False and in_redact_method == text_ocr_option:

 337              out_message = "No file selected"
 338              print(out_message)
 339
+340              return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 341
 342          if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
 343
 344              #Analyse and redact image-based pdf or image
 345              if is_pdf_or_image(file_path) == False:
 346                  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
+347                  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 348
 349              print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
 350

 374                  custom_recogniser_word_list,
 375                  redact_whole_page_list,
 376                  max_fuzzy_spelling_mistakes_num,
+377                  match_fuzzy_whole_phrase_bool,
+378                  log_files_output_paths=log_files_output_paths)
+379
 380              # Save Textract request metadata (if exists)
 381              if new_request_metadata:
 382                  all_request_metadata.append(new_request_metadata)
 383
 384          elif in_redact_method == text_ocr_option:

 387
 388              if is_pdf(file_path) == False:
 389                  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
+390                  return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 391
 392              # Analyse text-based pdf
 393              print('Redacting file as text-based PDF')

 417          else:
 418              out_message = "No redaction method selected"
 419              print(out_message)
+420              return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 421
 422          # If at last page, save to file
 423          if current_loop_page >= number_of_pages:

 432              # Save file
 433              if is_pdf(file_path) == False:
 434                  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
 435                  pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
 436                  out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
 437

 441
 442              out_file_paths.append(out_redacted_pdf_file_path)
 443
 444              out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
 445
 446              logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"

 453
 454              # Save the gradio_annotation_boxes to a JSON file
 455              try:
 456                  review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
 457
 458                  out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
 459                  review_df.to_csv(out_review_file_path, index=None)
 460                  out_file_paths.append(out_review_file_path)
 461
+462                  #print("Saved review file to csv")
 463
 464                  out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
 465                  with open(out_annotation_file_path, 'w') as f:
 466                      json.dump(annotations_all_pages, f)
 467                  log_files_output_paths.append(out_annotation_file_path)
 468
+469                  #print("Saving annotations to JSON")
 470
 471              except Exception as e:
 472                  print("Could not save annotations to json or csv file:", e)
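The save block persists the review output twice: raw JSON for logs and re-import, and a flattened CSV for the review UI. A sketch of the idea; the row-building here is a hypothetical, simpler stand-in for the repo's convert_review_json_to_pandas_df:

import json
import pandas as pd

def save_review_outputs(annotations_all_pages: list, base_path: str):
    with open(base_path + "_review_file.json", "w") as f:
        json.dump(annotations_all_pages, f)                  # exact annotation state
    rows = [{"page": i + 1, **box}
            for i, page in enumerate(annotations_all_pages)
            for box in page.get("boxes", [])]
    pd.DataFrame(rows).to_csv(base_path + "_review_file.csv", index=None)  # reviewer-friendly table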
 484              combined_out_message = combined_out_message + " " + out_time_message # Ensure this is a single string
 485
 486              estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
 487
 488          else:
 489              toc = time.perf_counter()

 506
 507      if combined_out_message: out_message = combined_out_message
 508
 509      # Ensure no duplicated output files
 510      log_files_output_paths = list(set(log_files_output_paths))
 511      out_file_paths = list(set(out_file_paths))
 512      review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
 513
+514      return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 515
 516  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
 517      '''

 634      # Unpack coordinates
 635      x1, y1, x2, y2 = rect_coordinates
 636
 637      x1 = (x1* scale_width)# + page_x_adjust
 638      new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
 639      x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
 990      if custom_recogniser_word_list:
 991          nlp_analyser.registry.remove_recognizer("CUSTOM")
 992          new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 993          nlp_analyser.registry.add_recognizer(new_custom_recogniser)
 994
 995          nlp_analyser.registry.remove_recognizer("CustomWordFuzzyRecognizer")
 996          new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
 997          nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 998
 999

 1028     else: page_min = page_min - 1
 1029
 1030     print("Page range:", str(page_min + 1), "to", str(page_max))
 1031
 1032     # If running Textract, check if file already exists. If it does, load in existing data
 1033     if analysis_type == textract_option:
 1034
 1035         json_file_path = output_folder + file_name + "_textract.json"
 1036
 1037         if not os.path.exists(json_file_path):
 1038             print("No existing Textract results file found.")
 1039             textract_data = {}
 1040         else:
 1041             # Open the file and load the JSON data
 1042             no_textract_file = False

 1049             textract_data = json.load(json_file)
 1050
 1051     ###
 1052     if current_loop_page == 0: page_loop_start = 0
 1053     else: page_loop_start = current_loop_page
 1054

 1062         page_break_return = False
 1063
 1064         reported_page_number = str(page_no + 1)
 1065
 1066         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
 1067         try:

 1078
 1079         #print("Image is in range of pages to redact")
 1080         if isinstance(image, str):
 1081             image = Image.open(image)
 1082
 1083         # Need image size to convert textract OCR outputs to the correct sizes

 1126                     text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
 1127                 except Exception as e:
 1128                     print("Textract extraction for page", reported_page_number, "failed due to:", e)
+1129                     text_blocks = []
 1130                     new_request_metadata = "Failed Textract API call"
 1131
 1132                 # Check if "pages" key exists, if not, initialize it as an empty list

 1165             redaction_bboxes = []
 1166
 1167
+1168             # if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
+1169             # elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
 1170
+1171             # # Save decision making process
+1172             # bboxes_str = str(redaction_bboxes)
+1173             # with open(interim_results_file_path, "w") as f:
+1174             #     f.write(bboxes_str)
 1175
 1176             # Merge close bounding boxes
 1177             merged_redaction_bboxes = merge_img_bboxes(redaction_bboxes, line_level_ocr_results_with_children, signature_recogniser_results, handwriting_recogniser_results, handwrite_signature_checkbox)
 1183             all_image_annotations_boxes = []
 1184
 1185             for box in merged_redaction_bboxes:
 1186
 1187                 x0 = box.left
 1188                 y0 = box.top

 1210
 1211         ## Apply annotations with pymupdf
 1212         else:
 1213             if redact_whole_page_list:
 1214                 int_reported_page_number = int(reported_page_number)
 1215                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

 1254
 1255         time_taken = toc - tic
 1256
 1257         # Break if time taken is greater than max_time seconds
 1258         if time_taken > max_time:
 1259             print("Processing for", max_time, "seconds, breaking loop.")

 1266             pymupdf_doc = images
 1267
 1268             # Check if the image already exists in annotations_all_pages
 1269             existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1270             if existing_index is not None:
 1271                 # Replace the existing annotation

 1282                 if json_file_path not in log_files_output_paths:
 1283                     log_files_output_paths.append(json_file_path)
 1284
+1285                     print("At end of redact_image_pdf function where time over max.", json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
+1286
 1287             current_loop_page += 1
 1288
 1289             return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number

 1293         pymupdf_doc = images
 1294
 1295         # Check if the image already exists in annotations_all_pages
 1296         existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 1297         if existing_index is not None:
 1298             # Replace the existing annotation
 1377
 1378         if isinstance(char, LTAnno):
 1379
 1380             added_text = char.get_text()
 1381
 1382             # Handle double quotes

 1392
 1393             # Check for line break (assuming a new line is indicated by a specific character)
 1394             if '\n' in added_text:
+1395
 1396                 # Finalize the current line
 1397                 if current_word:
 1398                     word_bboxes.append((current_word, current_word_bbox))

 1440         word_bboxes.append((current_word, current_word_bbox))
 1441
 1442     if full_text:
 1443         if re.search(r'[^\x00-\x7F]', full_text): # Matches any non-ASCII character
 1444             # Convert special characters to a human-readable format
+1445
 1446             full_text = clean_unicode_text(full_text)
 1447         full_text = full_text.strip()
+1448
 1449
 1450     line_level_results_out.append(OCRResult(full_text.strip(), round(overall_bbox[0],2), round(overall_bbox[1], 2), round(overall_bbox[2]-overall_bbox[0],2), round(overall_bbox[3]-overall_bbox[1],2)))
 1451
 1462     analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
 1463
 1464     # Remove brackets and split the string into four separate columns
 1465     # Split the boundingBox list into four separate columns
 1466     analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
 1467

 1473     analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
 1474     analysed_bounding_boxes_df_new['page'] = page_num + 1
 1475     decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
 1476
 1477     return decision_process_table
 1478
 1566         return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 1567
 1568     # Update custom word list analyser object with any new words that have been added to the custom deny list
 1569     if custom_recogniser_word_list:
 1570         nlp_analyser.registry.remove_recognizer("CUSTOM")
 1571         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)

 1575         new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
 1576         nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 1577
 1578     tic = time.perf_counter()
 1579
 1580     # Open with Pikepdf to get text lines

 1589     else: page_min = page_min - 1
 1590
 1591     print("Page range is",str(page_min + 1), "to", str(page_max))
 1592
 1593     if current_loop_page == 0: page_loop_start = 0
 1594     else: page_loop_start = current_loop_page

 1663             ### REDACTION
 1664
 1665             if chosen_redact_entities or chosen_redact_comprehend_entities:
 1666                 page_analysed_bounding_boxes = run_page_text_redaction(
 1667                     language,
 1668                     chosen_redact_entities,

 1680                     comprehend_query_number
 1681                 )
 1682
+1683
 1684             else:
 1685                 page_analysed_bounding_boxes = []
 1686
 1687
 1688             page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
 1689
 1690
 1691             # Annotate redactions on page
 1692             pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 1693
 1694             # Make pymupdf page redactions
 1695             if redact_whole_page_list:
 1696                 int_reported_page_number = int(reported_page_number)
 1697                 if int_reported_page_number in redact_whole_page_list: redact_whole_page = True

 1700
 1701             pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
 1702
 1703             reported_page_no = page_no + 1
 1704             print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
 1705

 1714
 1715             if not decision_process_table_on_page.empty:
 1716                 all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
+1717
 1718
 1719             toc = time.perf_counter()
 1720
 1721             time_taken = toc - tic
 1722
 1723             # Break if time taken is greater than max_time seconds
 1724             if time_taken > max_time:
 1725                 print("Processing for", max_time, "seconds, breaking.")
tools/redaction_review.py
CHANGED
@@ -396,7 +396,7 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
 396      row_value_page = evt.row_value[0] # This is the page number value
 397      return row_value_page
 398
-399  def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
 400      '''
 401      Converts coordinates from image space to Adobe PDF space.
 402

@@ -431,7 +431,7 @@ def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width,
 431      return pdf_x1, pdf_y1, pdf_x2, pdf_y2
 432
 433
-434  def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
 435      '''
 436      Create an xfdf file from a review csv file and a pdf
 437      '''
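convert_image_coords_to_adobe (retyped below with explicit float hints) maps image-space boxes into Adobe PDF user space, which has a bottom-left origin, so the y-axis must be flipped after scaling. A sketch under that assumption, not the repo's exact body:

def image_to_adobe(pdf_page_width: float, pdf_page_height: float,
                   image_width: float, image_height: float,
                   x1: float, y1: float, x2: float, y2: float):
    scale_x = pdf_page_width / image_width
    scale_y = pdf_page_height / image_height
    pdf_x1, pdf_x2 = x1 * scale_x, x2 * scale_x
    # Flip y: image y grows downward, PDF user-space y grows upward.
    pdf_y1 = pdf_page_height - (y2 * scale_y)
    pdf_y2 = pdf_page_height - (y1 * scale_y)
    return pdf_x1, pdf_y1, pdf_x2, pdf_y2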
 396      row_value_page = evt.row_value[0] # This is the page number value
 397      return row_value_page
 398
+399  def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
 400      '''
 401      Converts coordinates from image space to Adobe PDF space.
 402

 431      return pdf_x1, pdf_y1, pdf_x2, pdf_y2
 432
 433
+434  def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
 435      '''
 436      Create an xfdf file from a review csv file and a pdf
 437      '''
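create_xfdf writes the review table out as XFDF so redactions can round-trip through Adobe Acrobat. A hedged sketch of what such an export can look like using only the standard library; the repo's implementation likely differs in detail, and the row fields here are illustrative:

import xml.etree.ElementTree as ET

def rows_to_xfdf(rows, out_path: str):
    ns = "http://ns.adobe.com/xfdf/"
    ET.register_namespace("", ns)
    xfdf = ET.Element(f"{{{ns}}}xfdf")
    annots = ET.SubElement(xfdf, f"{{{ns}}}annots")
    for row in rows:  # row: dict with a 1-based page number and PDF-space coords
        square = ET.SubElement(annots, f"{{{ns}}}square")
        square.set("page", str(row["page"] - 1))  # XFDF pages are 0-based
        square.set("rect", "{0},{1},{2},{3}".format(row["x1"], row["y1"], row["x2"], row["y2"]))
        square.set("title", row.get("label", "Redaction"))
    ET.ElementTree(xfdf).write(out_path, xml_declaration=True, encoding="utf-8")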
|