Commit a770956
Parent(s): 928b1e9
Enhance file handling and UI features: improve the Gradio app layout with a fill-width option and integrate new settings for deny and fully-redacted-page lists (placeholders so far). Update the file conversion functions to handle CSV inputs and add CSV review file generation for redactions. All original and merged redaction boxes are now retained.
Files changed:
- .dockerignore +1 -0
- .gitignore +1 -0
- Dockerfile +1 -0
- app.py +53 -35
- tools/file_conversion.py +86 -18
- tools/file_redaction.py +399 -136
- tools/helper_functions.py +12 -12
- tools/redaction_review.py +108 -73
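For context, the new deny list and fully-redacted-pages list follow the same shape as the existing allow list: a CSV with a single column, one term (or page number) per row, as described by the new file-upload labels in app.py below. A minimal sketch of producing such files with pandas; the file names are the defaults declared in the diff, and whether a header row is expected is an assumption not confirmed by this commit:

    import pandas as pd

    # One word/phrase per row, case sensitive; terms in the deny list are always redacted.
    pd.DataFrame(["Example Ltd", "Jane Doe"]).to_csv("default_deny_list.csv", index=False, header=False)

    # One page number per row; these pages are redacted in full.
    pd.DataFrame([1, 5]).to_csv("default_fully_redacted_list.csv", index=False, header=False)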
.dockerignore CHANGED
@@ -6,6 +6,7 @@
 *.ipynb
 examples/*
 processing/*
+input/*
 output/*
 tools/__pycache__/*
 old_code/*
.gitignore CHANGED
@@ -6,6 +6,7 @@
 *.ipynb
 examples/*
 processing/*
+input/*
 output/*
 tools/__pycache__/*
 old_code/*
Dockerfile CHANGED
@@ -52,6 +52,7 @@ RUN useradd -m -u 1000 user
 
 # Create required directories
 RUN mkdir -p /home/user/app/output \
+    && mkdir -p /home/user/app/input \
     && mkdir -p /home/user/app/tld \
     && mkdir -p /home/user/app/logs \
     && chown -R user:user /home/user/app
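The new input/ directory (created in the image and excluded by .dockerignore/.gitignore) is where page images are cached. A small sketch of the path layout it implies, mirroring the out_path construction added to process_single_page in tools/file_conversion.py below (the PDF name here is illustrative):

    import os

    # output_dir defaults to 'input' and is resolved against the working directory
    output_dir = os.path.join(os.getcwd(), "input")
    out_path = os.path.join(output_dir, f"{os.path.basename('/tmp/example.pdf')}_0.png")
    # -> <cwd>/input/example.pdf_0.png
    os.makedirs(os.path.dirname(out_path), exist_ok=True)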
app.py CHANGED
@@ -54,7 +54,7 @@ else:
     default_pii_detector = local_pii_detector
 
 # Create the gradio interface
-app = gr.Blocks(theme = gr.themes.Base())
+app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
 
 with app:
 
@@ -67,7 +67,7 @@ with app:
     all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
     all_decision_process_table_state = gr.State(pd.DataFrame())
 
-
+
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
@@ -106,15 +106,7 @@ with app:
     estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
     annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
-    s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-
-    ## S3 default bucket and allow list file state
-    default_allow_list_file_name = "default_allow_list.csv"
-    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
-
-    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
-    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
-    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
+    s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
 
     ## Annotator zoom value
@@ -125,6 +117,25 @@ with app:
     clear_all_page_redactions = gr.State(True)
     prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
 
+    ## Settings page variables
+    default_allow_list_file_name = "default_allow_list.csv"
+    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
+    in_allow_list_state = gr.State(pd.DataFrame())
+
+    default_deny_list_file_name = "default_deny_list.csv"
+    default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
+    in_deny_list_state = gr.State(pd.DataFrame())
+    in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
+
+    fully_redacted_list_file_name = "default_fully_redacted_list.csv"
+    fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
+    in_fully_redacted_list_state = gr.State(pd.DataFrame())
+    in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
+
+    # S3 settings for default allow list load
+    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
+    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
+    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
     ###
     # UI DESIGN
@@ -172,6 +183,10 @@ with app:
     # Object annotation
     with gr.Tab("Review redactions", id="tab_object_annotation"):
 
+        with gr.Accordion(label = "Review previous redactions", open=True):
+            output_review_files = gr.File(label="Review output files", file_count='multiple')
+            upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
+
         with gr.Row():
             annotation_last_page_button = gr.Button("Previous page", scale = 3)
             annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
@@ -203,9 +218,7 @@ with app:
             annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
             annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
-
-        upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...redactions.json)")
-
+
     # TEXT / TABULAR DATA TAB
     with gr.Tab(label="Open text or Excel/csv files"):
         gr.Markdown(
@@ -236,8 +249,6 @@ with app:
         data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
         data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
-
-
     # SETTINGS TAB
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(
@@ -250,14 +261,18 @@ with app:
             page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
             page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
 
-
         with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
             with gr.Row():
-
-
-                gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
+                with gr.Column():
+                    in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.", file_count="multiple", height=50)
                     in_allow_list_text = gr.Textbox(label="Custom allow list load status")
-
+                with gr.Column():
+                    in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will always be redacted.", file_count="multiple", height=50)
+                    in_deny_list_text = gr.Textbox(label="Custom deny list load status")
+                with gr.Column():
+                    in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=50)
+                    in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
+
         with gr.Accordion("Add or remove entity types to redact", open = False):
             in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
 
@@ -266,15 +281,11 @@ with app:
             handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
             #with gr.Row():
             in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
-
 
         with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
             anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
-        log_files_output = gr.File(label="Log file output", interactive=False)
-
-        # If a custom allow list is uploaded
-        in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+        log_files_output = gr.File(label="Log file output", interactive=False)
 
     ###
     # PDF/IMAGE REDACTION
@@ -283,25 +294,22 @@ with app:
 
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
-        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
                     outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # If a file has been completed, the function will continue onto the next document
     latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
         then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
-    # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
-    # then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
-    # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
-    #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
-    #then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
-    ###
+    ###
+    # REVIEW PDF REDACTIONS
+    ###
 
     # Page controls at top
     annotate_current_page.submit(
@@ -326,7 +334,7 @@ with app:
         then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
-    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
+    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
     # Page controls at bottom
    annotate_current_page_bottom.submit(
@@ -355,6 +363,16 @@ with app:
     text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
+    ###
+    # SETTINGS PAGE INPUT / OUTPUT
+    ###
+    # If a custom allow list is uploaded
+    in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+    in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
+    in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
+
+
+
     ###
     # APP LOAD AND LOGGING
     ###
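All three upload components now route through custom_regex_load, whose body is not part of this commit. A hedged, hypothetical sketch of what such a loader plausibly does, given the inputs/outputs wired above (an uploaded one-column CSV going into a gr.State plus a status textbox); the function name and behaviour here are assumptions, not the project's actual implementation:

    import pandas as pd

    def custom_regex_load_sketch(file_paths, list_name="Allow list"):
        # Hypothetical stand-in: read the first uploaded one-column CSV and return
        # a status message plus the DataFrame held by the matching gr.State
        # (e.g. in_deny_list_state).
        if not file_paths:
            return f"{list_name}: no file loaded", pd.DataFrame()
        path = getattr(file_paths[0], "name", file_paths[0])  # Gradio may pass file objects or paths
        df = pd.read_csv(path, header=None)
        return f"{list_name} file loaded: {len(df)} rows", df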
tools/file_conversion.py CHANGED
@@ -1,13 +1,13 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, read_file
 from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 import os
 import re
-import gradio as gr
 import time
 import json
 import pymupdf
+import pandas as pd
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
@@ -48,10 +48,15 @@ def is_pdf(filename):
 
 
 
-def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple[int, str]:
+def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
-
+        # Construct the full output directory path relative to the current working directory
+        output_dir = os.path.join(os.getcwd(), output_dir)
+
+        # Use the output_dir to construct the out_path
+        out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
+
         if os.path.exists(out_path):
             print(f"Loading existing image for page {page_num + 1}")
             image = Image.open(out_path)
@@ -67,7 +72,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple
         print(f"Error processing page {page_num + 1}: {e}")
         return page_num, None
 
-def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
+def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8, output_dir: str = '/input'):
 
     # If preparing for review, just load the first page
     if prepare_for_review == True:
@@ -252,6 +257,7 @@ def prepare_image_or_pdf(
     """
 
     tic = time.perf_counter()
+    json_from_csv = False
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -341,10 +347,15 @@ def prepare_image_or_pdf(
         if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option
 
+        if file_extension in ['.csv']:
+            review_file_csv = read_file(file)
+            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
+            json_from_csv = True
+
         # If the file name ends with redactions.json, assume it is an annoations object, overwrite the current variable
-        if
+        if (file_extension in ['.json']) | (json_from_csv == True):
 
-            if prepare_for_review == True:
+            if (file_extension in ['.json']) & (prepare_for_review == True):
                 print("Preparing file for review")
                 if isinstance(file_path, str):
                     with open(file_path, 'r') as json_file:
@@ -353,6 +364,20 @@ def prepare_image_or_pdf(
                     # Assuming file_path is a NamedString or similar
                     all_annotations_object = json.loads(file_path) # Use loads for string content
 
+            # Assume it's a textract json
+            elif (file_extension in ['.json']) & (prepare_for_review != True):
+                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
+                json_contents = json.load(file_path)
+                # Write the response to a JSON file in output folder
+                out_folder = output_folder + file_path_without_ext + ".json"
+                with open(out_folder, 'w') as json_file:
+                    json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                continue
+
+            # If you have an annotations object from the above code
+            if all_annotations_object:
+                #print("out_annotations_object found:", all_annotations_object)
+
                 # Get list of page numbers
                 image_file_paths_pages = [
                     int(re.search(r'_(\d+)\.png$', os.path.basename(s)).group(1))
@@ -380,19 +405,11 @@ def prepare_image_or_pdf(
                 #print("all_annotations_object:", all_annotations_object)
 
                 # Write the response to a JSON file in output folder
-                out_folder = output_folder + file_path_without_ext +
+                out_folder = output_folder + file_path_without_ext + ".json"
                 with open(out_folder, 'w') as json_file:
                     json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
                 continue
-
-            else:
-                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
-                json_contents = json.load(file_path)
-                # Write the response to a JSON file in output folder
-                out_folder = output_folder + file_path_without_ext + file_extension
-                with open(out_folder, 'w') as json_file:
-                    json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
-                continue
+
 
         # Must be a pdf or image at this point
         else:
@@ -428,7 +445,6 @@ def prepare_image_or_pdf(
                 page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
                 page.insert_image(rect, filename=file_path) # Insert the image into the page
 
-
     toc = time.perf_counter()
     out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
 
@@ -467,3 +483,55 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
     #print("Out file paths:", out_file_paths)
 
     return out_message, out_file_paths
+
+
+def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
+    # Flatten the data
+    flattened_data = []
+
+    for entry in data:
+        #print("entry:", entry)
+        #print("flattened_data:", flattened_data)
+        image_path = entry["image"]
+
+        # Use regex to find the number before .png
+        match = re.search(r'_(\d+)\.png$', image_path)
+        if match:
+            number = match.group(1) # Extract the number
+            print(number) # Output: 0
+            reported_number = int(number) + 1
+        else:
+            print("No number found before .png")
+
+        for box in entry["boxes"]:
+            data_to_add = {"image": image_path, "page":reported_number, **box}
+            #print("data_to_add:", data_to_add)
+            flattened_data.append(data_to_add)
+
+    # Convert to a DataFrame
+    df = pd.DataFrame(flattened_data)
+
+    return df
+
+def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
+    # Keep only necessary columns
+    df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
+
+    # Group the DataFrame by the 'image' column
+    grouped = df.groupby('image')
+
+    # Create a list to hold the JSON data
+    json_data = []
+
+    # Iterate over each group
+    for image_path, group in grouped:
+        # Convert each group to a list of box dictionaries
+        boxes = group.drop(columns=['image', 'page']).to_dict(orient='records')
+
+        # Append the structured data to the json_data list
+        json_data.append({
+            "image": image_path,
+            "boxes": boxes
+        })
+
+    return json_data
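The two new helpers above round-trip the annotator's JSON (one entry per page image, each with a "boxes" list) to and from the flat review CSV that prepare_image_or_pdf now accepts. A minimal usage sketch; the file paths and box values are illustrative only, and note that non-scalar fields such as the colour tuple come back as strings after a CSV round trip:

    import pandas as pd
    from tools.file_conversion import convert_review_json_to_pandas_df, convert_pandas_df_to_review_json

    # Example annotation object in the shape the annotator produces (values are illustrative)
    annotations = [{
        "image": "input/example.pdf_0.png",
        "boxes": [{"xmin": 10, "ymin": 20, "xmax": 110, "ymax": 40, "color": (0, 0, 0), "label": "PERSON"}],
    }]

    review_df = convert_review_json_to_pandas_df(annotations)  # adds a 1-based "page" column parsed from the image name
    review_df.to_csv("example_review_file.csv", index=False)

    # Loading the CSV back (as prepare_image_or_pdf now does for .csv uploads) restores the grouped JSON shape
    round_trip = convert_pandas_df_to_review_json(pd.read_csv("example_review_file.csv"))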
tools/file_redaction.py
CHANGED
@@ -4,6 +4,7 @@ import json
|
|
4 |
import io
|
5 |
import os
|
6 |
import boto3
|
|
|
7 |
|
8 |
from tqdm import tqdm
|
9 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
@@ -25,7 +26,7 @@ from collections import defaultdict # For efficient grouping
|
|
25 |
from presidio_analyzer import RecognizerResult
|
26 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
27 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
28 |
-
from tools.file_conversion import process_file, image_dpi
|
29 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
30 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
31 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
@@ -68,6 +69,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
68 |
chosen_redact_comprehend_entities:List[str],
|
69 |
in_redact_method:str,
|
70 |
in_allow_list:List[List[str]]=None,
|
|
|
|
|
71 |
latest_file_completed:int=0,
|
72 |
out_message:list=[],
|
73 |
out_file_paths:list=[],
|
@@ -99,6 +102,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
99 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
100 |
- in_redact_method (str): The method to use for redaction.
|
101 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
|
|
|
|
102 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
103 |
- out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
104 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
@@ -188,7 +193,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
188 |
|
189 |
if not in_allow_list.empty:
|
190 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
191 |
-
print("In allow list:", in_allow_list_flat)
|
192 |
else:
|
193 |
in_allow_list_flat = []
|
194 |
|
@@ -236,7 +241,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
236 |
file_paths_list = file_paths
|
237 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
238 |
|
239 |
-
print("file_paths_list in choose_redactor function:", file_paths_list)
|
240 |
|
241 |
|
242 |
for file in file_paths_loop:
|
@@ -269,7 +274,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
269 |
|
270 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
271 |
|
272 |
-
pymupdf_doc,all_decision_process_table,
|
273 |
prepared_pdf_image_paths,
|
274 |
language,
|
275 |
chosen_redact_entities,
|
@@ -300,7 +305,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
300 |
|
301 |
elif in_redact_method == text_ocr_option:
|
302 |
|
303 |
-
|
304 |
|
305 |
if is_pdf(file_path) == False:
|
306 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
@@ -353,12 +358,12 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
353 |
|
354 |
out_file_paths.append(out_image_file_path)
|
355 |
|
356 |
-
if
|
357 |
-
|
358 |
|
359 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
360 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
361 |
-
|
362 |
|
363 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
364 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
@@ -366,12 +371,23 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
366 |
|
367 |
# Save the gradio_annotation_boxes to a JSON file
|
368 |
try:
|
369 |
-
|
|
|
|
|
370 |
with open(out_annotation_file_path, 'w') as f:
|
371 |
json.dump(annotations_all_pages, f)
|
372 |
-
|
373 |
-
|
374 |
-
print("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
375 |
|
376 |
# Make a combined message for the file
|
377 |
if isinstance(out_message, list):
|
@@ -578,7 +594,50 @@ def move_page_info(file_path: str) -> str:
|
|
578 |
|
579 |
return new_file_path
|
580 |
|
581 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
582 |
|
583 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
584 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
@@ -669,40 +728,42 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None, custo
|
|
669 |
|
670 |
all_image_annotation_boxes.append(img_annotation_box)
|
671 |
|
672 |
-
|
673 |
-
#print("Rect:", rect)
|
674 |
-
#middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
675 |
-
rect_small_pixel_height = Rect(pymupdf_x1, pymupdf_y1 + 2, pymupdf_x2, pymupdf_y2 - 2) # Slightly smaller than outside box
|
676 |
|
677 |
-
|
678 |
-
|
679 |
-
page
|
|
|
|
|
|
|
|
|
680 |
|
681 |
-
|
682 |
-
shape = page.new_shape()
|
683 |
-
shape.draw_rect(rect)
|
684 |
|
685 |
-
|
|
|
686 |
|
687 |
-
|
688 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
689 |
|
690 |
-
|
691 |
-
out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
|
692 |
-
else:
|
693 |
-
out_colour = img_annotation_box["color"]
|
694 |
-
else:
|
695 |
-
out_colour = (0,0,0)
|
696 |
|
697 |
-
|
698 |
-
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
699 |
-
shape.commit()
|
700 |
|
701 |
out_annotation_boxes = {
|
702 |
"image": image_path, #Image.open(image_path), #image_path,
|
703 |
"boxes": all_image_annotation_boxes
|
704 |
}
|
705 |
|
|
|
|
|
|
|
706 |
page.apply_redactions(images=0, graphics=0)
|
707 |
page.clean_contents()
|
708 |
|
@@ -713,33 +774,38 @@ def bounding_boxes_overlap(box1, box2):
|
|
713 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
714 |
box1[1] < box2[3] and box2[1] < box1[3])
|
715 |
|
|
|
|
|
|
|
|
|
716 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
|
|
|
|
717 |
merged_bboxes = []
|
718 |
grouped_bboxes = defaultdict(list)
|
719 |
|
720 |
-
|
|
|
|
|
|
|
721 |
if signature_recogniser_results or handwriting_recogniser_results:
|
722 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
723 |
-
|
724 |
-
merged_bboxes.extend(handwriting_recogniser_results)
|
725 |
|
726 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
727 |
-
|
728 |
-
merged_bboxes.extend(signature_recogniser_results)
|
729 |
-
|
730 |
|
731 |
# Reconstruct bounding boxes for substrings of interest
|
732 |
reconstructed_bboxes = []
|
733 |
for bbox in bboxes:
|
734 |
-
#print("bbox:", bbox)
|
735 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
736 |
for line_text, line_info in combined_results.items():
|
737 |
line_box = line_info['bounding_box']
|
738 |
-
if bounding_boxes_overlap(bbox_box, line_box):
|
739 |
if bbox.text in line_text:
|
740 |
start_char = line_text.index(bbox.text)
|
741 |
end_char = start_char + len(bbox.text)
|
742 |
-
|
743 |
relevant_words = []
|
744 |
current_char = 0
|
745 |
for word in line_info['words']:
|
@@ -753,16 +819,13 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
753 |
current_char += 1 # +1 for space if the word doesn't already end with a space
|
754 |
|
755 |
if relevant_words:
|
756 |
-
#print("Relevant words:", relevant_words)
|
757 |
left = min(word['bounding_box'][0] for word in relevant_words)
|
758 |
top = min(word['bounding_box'][1] for word in relevant_words)
|
759 |
right = max(word['bounding_box'][2] for word in relevant_words)
|
760 |
bottom = max(word['bounding_box'][3] for word in relevant_words)
|
761 |
-
|
762 |
-
# Combine the text of all relevant words
|
763 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
764 |
|
765 |
-
# Calculate new dimensions for the merged box
|
766 |
reconstructed_bbox = CustomImageRecognizerResult(
|
767 |
bbox.entity_type,
|
768 |
bbox.start,
|
@@ -771,13 +834,13 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
771 |
left,
|
772 |
top,
|
773 |
right - left, # width
|
774 |
-
bottom - top, # height
|
775 |
combined_text
|
776 |
)
|
777 |
-
reconstructed_bboxes.append(
|
|
|
778 |
break
|
779 |
else:
|
780 |
-
# If the bbox text is not found in any line in combined_results, keep the original bbox
|
781 |
reconstructed_bboxes.append(bbox)
|
782 |
|
783 |
# Group reconstructed bboxes by approximate vertical proximity
|
@@ -791,35 +854,141 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
791 |
merged_box = group[0]
|
792 |
for next_box in group[1:]:
|
793 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
794 |
-
|
795 |
-
|
796 |
-
new_text = merged_box.text
|
797 |
-
else:
|
798 |
-
new_text = merged_box.text + " " + next_box.text
|
799 |
-
|
800 |
-
if merged_box.text == next_box.text:
|
801 |
-
new_text = merged_box.text
|
802 |
-
new_entity_type = merged_box.entity_type # Keep the original entity type
|
803 |
-
else:
|
804 |
-
new_text = merged_box.text + " " + next_box.text
|
805 |
-
new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
|
806 |
|
807 |
new_left = min(merged_box.left, next_box.left)
|
808 |
new_top = min(merged_box.top, next_box.top)
|
809 |
new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
|
810 |
new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
|
|
|
811 |
merged_box = CustomImageRecognizerResult(
|
812 |
new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
|
813 |
)
|
814 |
else:
|
815 |
merged_bboxes.append(merged_box)
|
816 |
-
merged_box = next_box
|
817 |
|
818 |
merged_bboxes.append(merged_box)
|
819 |
|
820 |
-
|
821 |
-
|
822 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
823 |
|
824 |
def redact_image_pdf(file_path:str,
|
825 |
prepared_pdf_file_paths:List[str],
|
@@ -846,7 +1015,7 @@ def redact_image_pdf(file_path:str,
|
|
846 |
custom_recogniser_word_list:List[str]=[],
|
847 |
redact_whole_page_list:List[str]=[],
|
848 |
page_break_val:int=int(page_break_value),
|
849 |
-
|
850 |
max_time:int=int(max_time_value),
|
851 |
progress=Progress(track_tqdm=True)):
|
852 |
|
@@ -878,7 +1047,7 @@ def redact_image_pdf(file_path:str,
|
|
878 |
- custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
|
879 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
880 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
881 |
-
-
|
882 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
883 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
884 |
|
@@ -901,12 +1070,12 @@ def redact_image_pdf(file_path:str,
|
|
901 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
902 |
print("Connection to AWS Comprehend service unsuccessful.")
|
903 |
|
904 |
-
return pymupdf_doc, all_decision_process_table,
|
905 |
|
906 |
if analysis_type == textract_option and textract_client == "":
|
907 |
print("Connection to AWS Textract service unsuccessful.")
|
908 |
|
909 |
-
return pymupdf_doc, all_decision_process_table,
|
910 |
|
911 |
tic = time.perf_counter()
|
912 |
|
@@ -937,14 +1106,14 @@ def redact_image_pdf(file_path:str,
|
|
937 |
if analysis_type == textract_option:
|
938 |
|
939 |
json_file_path = output_folder + file_name + "_textract.json"
|
940 |
-
|
941 |
|
942 |
if not os.path.exists(json_file_path):
|
943 |
no_textract_file = True
|
944 |
print("No existing Textract results file found.")
|
945 |
existing_data = {}
|
946 |
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
947 |
-
#
|
948 |
#request_metadata = request_metadata + "\n" + new_request_metadata
|
949 |
#wrapped_text_blocks = {"pages":[text_blocks]}
|
950 |
else:
|
@@ -1015,7 +1184,7 @@ def redact_image_pdf(file_path:str,
|
|
1015 |
|
1016 |
if not existing_data:
|
1017 |
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1018 |
-
|
1019 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
1020 |
|
1021 |
existing_data = {"pages":[text_blocks]}
|
@@ -1043,7 +1212,7 @@ def redact_image_pdf(file_path:str,
|
|
1043 |
|
1044 |
# if not os.path.exists(json_file_path):
|
1045 |
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1046 |
-
#
|
1047 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1048 |
|
1049 |
# existing_data = {"pages":[text_blocks]}
|
@@ -1073,7 +1242,7 @@ def redact_image_pdf(file_path:str,
|
|
1073 |
# with open(json_file_path, 'w') as json_file:
|
1074 |
# json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1075 |
|
1076 |
-
#
|
1077 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1078 |
# else:
|
1079 |
# # If the page exists, retrieve the data
|
@@ -1204,7 +1373,7 @@ def redact_image_pdf(file_path:str,
|
|
1204 |
|
1205 |
current_loop_page += 1
|
1206 |
|
1207 |
-
return pymupdf_doc, all_decision_process_table,
|
1208 |
|
1209 |
if is_pdf(file_path) == False:
|
1210 |
images.append(image)
|
@@ -1225,7 +1394,7 @@ def redact_image_pdf(file_path:str,
|
|
1225 |
with open(json_file_path, 'w') as json_file:
|
1226 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1227 |
|
1228 |
-
return pymupdf_doc, all_decision_process_table,
|
1229 |
|
1230 |
if analysis_type == textract_option:
|
1231 |
# Write the updated existing textract data back to the JSON file
|
@@ -1233,7 +1402,7 @@ def redact_image_pdf(file_path:str,
|
|
1233 |
with open(json_file_path, 'w') as json_file:
|
1234 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1235 |
|
1236 |
-
return pymupdf_doc, all_decision_process_table,
|
1237 |
|
1238 |
|
1239 |
###
|
@@ -1349,16 +1518,18 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
|
|
1349 |
|
1350 |
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
1351 |
|
1352 |
-
def merge_text_bounding_boxes(analyser_results
|
1353 |
'''
|
1354 |
Merge identified bounding boxes containing PII that are very close to one another
|
1355 |
'''
|
1356 |
analysed_bounding_boxes = []
|
|
|
|
|
1357 |
if len(analyser_results) > 0 and len(characters) > 0:
|
1358 |
# Extract bounding box coordinates for sorting
|
1359 |
bounding_boxes = []
|
1360 |
-
text_out = []
|
1361 |
for result in analyser_results:
|
|
|
1362 |
char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1363 |
char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1364 |
if char_boxes:
|
@@ -1367,9 +1538,12 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
|
|
1367 |
bottom = min(box[1] for box in char_boxes)
|
1368 |
right = max(box[2] for box in char_boxes)
|
1369 |
top = max(box[3] for box in char_boxes) + vertical_padding
|
1370 |
-
|
|
|
1371 |
|
1372 |
-
|
|
|
|
|
1373 |
|
1374 |
# Sort the results by y-coordinate and then by x-coordinate
|
1375 |
bounding_boxes.sort()
|
@@ -1380,74 +1554,163 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
|
|
1380 |
current_result = None
|
1381 |
current_text = []
|
1382 |
|
1383 |
-
for y, x, result,
|
1384 |
-
#print(f"Considering result: {result}")
|
1385 |
-
#print(f"Character box: {char_box}")
|
1386 |
-
|
1387 |
if current_y is None or current_box is None:
|
1388 |
-
|
1389 |
-
|
|
|
1390 |
current_result = result
|
1391 |
current_text = list(text)
|
1392 |
-
#print(f"Starting new box: {current_box}")
|
1393 |
else:
|
1394 |
-
vertical_diff_bboxes = abs(
|
1395 |
-
horizontal_diff_bboxes = abs(
|
1396 |
-
|
1397 |
-
|
1398 |
-
|
1399 |
-
|
1400 |
-
|
1401 |
-
|
1402 |
-
|
1403 |
-
|
1404 |
-
current_box[2]
|
1405 |
-
|
1406 |
-
|
|
|
|
|
|
|
1407 |
try:
|
1408 |
-
|
1409 |
except Exception as e:
|
1410 |
-
print("Unable to combine result entity types:")
|
1411 |
-
print(e)
|
1412 |
-
# Add a space if current_text is not empty
|
1413 |
if current_text:
|
1414 |
-
|
1415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1416 |
|
1417 |
-
#print(f"Latest merged box: {current_box[-1]}")
|
1418 |
else:
|
1419 |
-
|
1420 |
-
|
1421 |
-
#
|
1422 |
-
#
|
1423 |
-
|
1424 |
-
#
|
1425 |
-
|
1426 |
-
|
|
|
1427 |
current_result = result
|
1428 |
current_text = list(text)
|
1429 |
-
#print(f"Starting new box: {current_box}")
|
1430 |
-
|
1431 |
-
# After finishing with the current result, add the last box for this result
|
1432 |
-
if current_box:
|
1433 |
-
merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
|
1434 |
-
#print(f"Appending final box for result: {current_box}")
|
1435 |
-
|
1436 |
-
if not merged_bounding_boxes:
|
1437 |
-
analysed_bounding_boxes.extend(
|
1438 |
-
{"text":text, "boundingBox": char.bbox, "result": result}
|
1439 |
-
for result in analyser_results
|
1440 |
-
for char in characters[result.start:result.end]
|
1441 |
-
if isinstance(char, LTChar)
|
1442 |
-
)
|
1443 |
-
else:
|
1444 |
-
analysed_bounding_boxes.extend(merged_bounding_boxes)
|
1445 |
|
1446 |
-
#
|
1447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1448 |
return analysed_bounding_boxes
|
1449 |
|
1450 |
|
1451 |
|
1452 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1453 |
decision_process_table = pd.DataFrame()
|
|
|
4 |
import io
|
5 |
import os
|
6 |
import boto3
|
7 |
+
import copy
|
8 |
|
9 |
from tqdm import tqdm
|
10 |
from PIL import Image, ImageChops, ImageFile, ImageDraw
|
|
|
26 |
from presidio_analyzer import RecognizerResult
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
|
29 |
+
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df
|
30 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
|
|
69 |
chosen_redact_comprehend_entities:List[str],
|
70 |
in_redact_method:str,
|
71 |
in_allow_list:List[List[str]]=None,
|
72 |
+
in_deny_list:List[List[str]]=None,
|
73 |
+
in_fully_redacted_list:List[List[str]]=None,
|
74 |
latest_file_completed:int=0,
|
75 |
out_message:list=[],
|
76 |
out_file_paths:list=[],
|
|
|
102 |
- chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
|
103 |
- in_redact_method (str): The method to use for redaction.
|
104 |
- in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
|
105 |
+
- in_deny_list (List[List[str]], optional): A list of specific terms to always redact (deny list). Defaults to None.
|
106 |
+
- in_fully_redacted_list (List[List[str]], optional): A list of pages to fully redact. Defaults to None.
|
107 |
- latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
|
108 |
- out_message (list, optional): A list to store output messages. Defaults to an empty list.
|
109 |
- out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
|
|
|
193 |
|
194 |
if not in_allow_list.empty:
|
195 |
in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
|
196 |
+
#print("In allow list:", in_allow_list_flat)
|
197 |
else:
|
198 |
in_allow_list_flat = []
|
199 |
|
|
|
241 |
file_paths_list = file_paths
|
242 |
file_paths_loop = [file_paths_list[int(latest_file_completed)]]
|
243 |
|
244 |
+
# print("file_paths_list in choose_redactor function:", file_paths_list)
|
245 |
|
246 |
|
247 |
for file in file_paths_loop:
|
|
|
274 |
|
275 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
276 |
|
277 |
+
pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
278 |
prepared_pdf_image_paths,
|
279 |
language,
|
280 |
chosen_redact_entities,
|
|
|
305 |
|
306 |
elif in_redact_method == text_ocr_option:
|
307 |
|
308 |
+
#log_files_output_paths = []
|
309 |
|
310 |
if is_pdf(file_path) == False:
|
311 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
|
|
358 |
|
359 |
out_file_paths.append(out_image_file_path)
|
360 |
|
361 |
+
#if log_files_output_paths:
|
362 |
+
# log_files_output_paths.extend(log_files_output_paths)
|
363 |
|
364 |
logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
|
365 |
all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
|
366 |
+
log_files_output_paths.append(logs_output_file_name)
|
367 |
|
368 |
all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
|
369 |
all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
|
|
|
371 |
|
372 |
# Save the gradio_annotation_boxes to a JSON file
|
373 |
try:
|
374 |
+
print("Saving annotations to JSON")
|
375 |
+
|
376 |
+
out_annotation_file_path = out_image_file_path + '_review_file.json'
|
377 |
with open(out_annotation_file_path, 'w') as f:
|
378 |
json.dump(annotations_all_pages, f)
|
379 |
+
log_files_output_paths.append(out_annotation_file_path)
|
380 |
+
|
381 |
+
print("Saving annotations to CSV")
|
382 |
+
|
383 |
+
# Convert json to csv and also save this
|
384 |
+
review_df = convert_review_json_to_pandas_df(annotations_all_pages)
|
385 |
+
out_review_file_file_path = out_image_file_path + '_review_file.csv'
|
386 |
+
review_df.to_csv(out_review_file_file_path, index=None)
|
387 |
+
out_file_paths.append(out_review_file_file_path)
|
388 |
+
|
389 |
+
except Exception as e:
|
390 |
+
print("Could not save annotations to JSON or CSV review file:", e)
|
391 |
|
392 |
# Make a combined message for the file
|
393 |
if isinstance(out_message, list):
|
|
|
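The review CSV written above is a flattened version of the per-page annotation JSON. A minimal sketch of that flattening, assuming each page entry carries an "image" path and a list of box dictionaries; the real convert_review_json_to_pandas_df lives in tools/file_conversion.py and may differ in detail:

import pandas as pd

def flatten_review_json(annotations_all_pages: list) -> pd.DataFrame:
    # Each page entry looks like {"image": "<path>", "boxes": [{"label": ..., "xmin": ..., ...}, ...]}
    rows = []
    for page in annotations_all_pages:
        for box in page.get("boxes", []):
            row = {"image": page["image"]}
            row.update(box)
            rows.append(row)
    return pd.DataFrame(rows)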
594 |
|
595 |
return new_file_path
|
596 |
|
597 |
+
def convert_color_to_range_0_1(color):
|
598 |
+
return tuple(component / 255 for component in color)
|
599 |
+
|
600 |
+
def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
|
601 |
+
pymupdf_x1 = pymupdf_rect[0]
|
602 |
+
pymupdf_y1 = pymupdf_rect[1]
|
603 |
+
pymupdf_x2 = pymupdf_rect[2]
|
604 |
+
pymupdf_y2 = pymupdf_rect[3]
|
605 |
+
|
606 |
+
# Calculate area to actually remove text from the pdf (different from black box size)
|
607 |
+
redact_bottom_y = pymupdf_y1 + 2
|
608 |
+
redact_top_y = pymupdf_y2 - 2
|
609 |
+
|
610 |
+
# Calculate the middle y value and set a small height if default values are too close together
|
611 |
+
if (redact_top_y - redact_bottom_y) < 1:
|
612 |
+
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
613 |
+
redact_bottom_y = middle_y - 1
|
614 |
+
redact_top_y = middle_y + 1
|
615 |
+
|
616 |
+
#print("Rect:", rect)
|
617 |
+
|
618 |
+
rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
|
619 |
+
|
620 |
+
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
621 |
+
#page.add_redact_annot(rect)#rect_small_pixel_height)
|
622 |
+
pymupdf_page.add_redact_annot(rect_small_pixel_height)
|
623 |
+
|
624 |
+
# Set up drawing a black box over the whole rect
|
625 |
+
shape = pymupdf_page.new_shape()
|
626 |
+
shape.draw_rect(pymupdf_rect)
|
627 |
+
|
628 |
+
if custom_colours == True:
|
629 |
+
if img_annotation_box["color"][0] > 1:
|
630 |
+
out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
|
631 |
+
else:
|
632 |
+
out_colour = img_annotation_box["color"]
|
633 |
+
else:
|
634 |
+
out_colour = (0,0,0)
|
635 |
+
|
636 |
+
shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
|
637 |
+
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
638 |
+
shape.commit()
|
639 |
+
|
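A rough, illustrative driver for redact_single_box, assuming a PDF already open with PyMuPDF; the file name and coordinates are made up and the rectangle would normally come from the annotation boxes:

import pymupdf
from tools.file_redaction import redact_single_box

doc = pymupdf.open("example.pdf")                      # hypothetical input PDF
page = doc[0]

img_annotation_box = {"label": "PERSON", "color": (0, 0, 0)}
rect = pymupdf.Rect(100, 200, 250, 215)                # PDF coordinates of the text to hide

redact_single_box(page, rect, img_annotation_box, custom_colours=False)
page.apply_redactions(images=0, graphics=0)            # actually strip the underlying text
doc.save("example_redacted.pdf")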
640 |
+
def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
|
641 |
|
642 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
643 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
|
|
728 |
|
729 |
all_image_annotation_boxes.append(img_annotation_box)
|
730 |
|
731 |
+
redact_single_box(page, rect, img_annotation_box, custom_colours)
|
|
|
|
|
|
|
732 |
|
733 |
+
# If whole page is to be redacted, do that here
|
734 |
+
if redact_whole_page == True:
|
735 |
+
# Small border to page that remains white
|
736 |
+
border = 5
|
737 |
+
# Define the coordinates for the Rect
|
738 |
+
whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
|
739 |
+
whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
|
740 |
|
741 |
+
whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
|
|
|
|
|
742 |
|
743 |
+
# Create new image annotation element based on whole page coordinates
|
744 |
+
whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
|
745 |
|
746 |
+
# Write whole page annotation to annotation boxes
|
747 |
+
whole_page_img_annotation_box = {}
|
748 |
+
whole_page_img_annotation_box["xmin"] = whole_page_image_x1
|
749 |
+
whole_page_img_annotation_box["ymin"] = whole_page_image_y1
|
750 |
+
whole_page_img_annotation_box["xmax"] = whole_page_image_x2
|
751 |
+
whole_page_img_annotation_box["ymax"] = whole_page_image_y2
|
752 |
+
whole_page_img_annotation_box["color"] = (0,0,0)
|
753 |
+
whole_page_img_annotation_box["label"] = "Whole page"
|
754 |
|
755 |
+
redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
|
|
|
|
|
|
|
|
|
|
|
756 |
|
757 |
+
all_image_annotation_boxes.append(whole_page_img_annotation_box)
|
|
|
|
|
758 |
|
759 |
out_annotation_boxes = {
|
760 |
"image": image_path, #Image.open(image_path), #image_path,
|
761 |
"boxes": all_image_annotation_boxes
|
762 |
}
|
763 |
|
764 |
+
|
765 |
+
|
766 |
+
|
767 |
page.apply_redactions(images=0, graphics=0)
|
768 |
page.clean_contents()
|
769 |
|
|
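The whole-page branch above converts PyMuPDF coordinates into image coordinates before recording the annotation box. A generic sketch of that kind of conversion, assuming the page image covers the full mediabox; the project's convert_pymupdf_to_image_coords may handle cropping and rotation differently:

def pdf_to_image_coords(page, x1, y1, x2, y2, image):
    # Scale by image size over mediabox size; PDF origin is bottom-left, image origin is top-left
    image_width, image_height = image.size
    scale_x = image_width / page.mediabox.width
    scale_y = image_height / page.mediabox.height
    img_x1 = x1 * scale_x
    img_x2 = x2 * scale_x
    img_y1 = image_height - (y2 * scale_y)
    img_y2 = image_height - (y1 * scale_y)
    return img_x1, img_y1, img_x2, img_y2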
|
774 |
return (box1[0] < box2[2] and box2[0] < box1[2] and
|
775 |
box1[1] < box2[3] and box2[1] < box1[3])
|
776 |
|
777 |
+
from collections import defaultdict
|
778 |
+
from typing import List, Dict
|
779 |
+
import copy
|
780 |
+
|
781 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
782 |
+
|
783 |
+
all_bboxes = []
|
784 |
merged_bboxes = []
|
785 |
grouped_bboxes = defaultdict(list)
|
786 |
|
787 |
+
# Deep copy original bounding boxes to retain them
|
788 |
+
original_bboxes = copy.deepcopy(bboxes)
|
789 |
+
|
790 |
+
# Process signature and handwriting results
|
791 |
if signature_recogniser_results or handwriting_recogniser_results:
|
792 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
793 |
+
merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
|
|
|
794 |
|
795 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
796 |
+
merged_bboxes.extend(copy.deepcopy(signature_recogniser_results))
|
|
|
|
|
797 |
|
798 |
# Reconstruct bounding boxes for substrings of interest
|
799 |
reconstructed_bboxes = []
|
800 |
for bbox in bboxes:
|
|
|
801 |
bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
802 |
for line_text, line_info in combined_results.items():
|
803 |
line_box = line_info['bounding_box']
|
804 |
+
if bounding_boxes_overlap(bbox_box, line_box):
|
805 |
if bbox.text in line_text:
|
806 |
start_char = line_text.index(bbox.text)
|
807 |
end_char = start_char + len(bbox.text)
|
808 |
+
|
809 |
relevant_words = []
|
810 |
current_char = 0
|
811 |
for word in line_info['words']:
|
|
|
819 |
current_char += 1 # +1 for space if the word doesn't already end with a space
|
820 |
|
821 |
if relevant_words:
|
|
|
822 |
left = min(word['bounding_box'][0] for word in relevant_words)
|
823 |
top = min(word['bounding_box'][1] for word in relevant_words)
|
824 |
right = max(word['bounding_box'][2] for word in relevant_words)
|
825 |
bottom = max(word['bounding_box'][3] for word in relevant_words)
|
826 |
+
|
|
|
827 |
combined_text = " ".join(word['text'] for word in relevant_words)
|
828 |
|
|
|
829 |
reconstructed_bbox = CustomImageRecognizerResult(
|
830 |
bbox.entity_type,
|
831 |
bbox.start,
|
|
|
834 |
left,
|
835 |
top,
|
836 |
right - left, # width
|
837 |
+
bottom - top, # height,
|
838 |
combined_text
|
839 |
)
|
840 |
+
#reconstructed_bboxes.append(bbox) # Add original bbox
|
841 |
+
reconstructed_bboxes.append(reconstructed_bbox) # Add merged bbox
|
842 |
break
|
843 |
else:
|
|
|
844 |
reconstructed_bboxes.append(bbox)
|
845 |
|
846 |
# Group reconstructed bboxes by approximate vertical proximity
|
|
|
854 |
merged_box = group[0]
|
855 |
for next_box in group[1:]:
|
856 |
if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
857 |
+
new_text = merged_box.text + " " + next_box.text
|
858 |
+
new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
|
|
|
|
|
859 |
|
860 |
new_left = min(merged_box.left, next_box.left)
|
861 |
new_top = min(merged_box.top, next_box.top)
|
862 |
new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
|
863 |
new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
|
864 |
+
|
865 |
merged_box = CustomImageRecognizerResult(
|
866 |
new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
|
867 |
)
|
868 |
else:
|
869 |
merged_bboxes.append(merged_box)
|
870 |
+
merged_box = next_box
|
871 |
|
872 |
merged_bboxes.append(merged_box)
|
873 |
|
874 |
+
all_bboxes.extend(original_bboxes)
|
875 |
+
#all_bboxes.extend(reconstructed_bboxes)
|
876 |
+
all_bboxes.extend(merged_bboxes)
|
877 |
+
|
878 |
+
# Return the unique original and merged bounding boxes
|
879 |
+
unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
|
880 |
+
return unique_bboxes
|
881 |
+
|
882 |
+
|
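The final deduplication above keeps one box per unique (left, top, width, height) key, so a merged box that exactly coincides with an original is only returned once. The same dictionary trick on plain tuples, as a quick illustration:

# Later entries win on key collisions, mirroring the dict comprehension in merge_img_bboxes
boxes = [(10, 20, 50, 12), (10, 20, 50, 12), (80, 20, 30, 12)]
unique = list({(b[0], b[1], b[2], b[3]): b for b in boxes}.values())
print(unique)  # [(10, 20, 50, 12), (80, 20, 30, 12)]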
883 |
+
# def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
884 |
+
# merged_bboxes = []
|
885 |
+
# grouped_bboxes = defaultdict(list)
|
886 |
+
|
887 |
+
# # Process signature and handwriting results
|
888 |
+
# if signature_recogniser_results or handwriting_recogniser_results:
|
889 |
+
# if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
890 |
+
# #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
|
891 |
+
# merged_bboxes.extend(handwriting_recogniser_results)
|
892 |
+
|
893 |
+
# if "Redact all identified signatures" in handwrite_signature_checkbox:
|
894 |
+
# #print("Signature boxes exist at merge:", signature_recogniser_results)
|
895 |
+
# merged_bboxes.extend(signature_recogniser_results)
|
896 |
+
|
897 |
+
|
898 |
+
# # Reconstruct bounding boxes for substrings of interest
|
899 |
+
# reconstructed_bboxes = []
|
900 |
+
# for bbox in bboxes:
|
901 |
+
# #print("bbox:", bbox)
|
902 |
+
# bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
|
903 |
+
# for line_text, line_info in combined_results.items():
|
904 |
+
# line_box = line_info['bounding_box']
|
905 |
+
# if bounding_boxes_overlap(bbox_box, line_box):
|
906 |
+
# if bbox.text in line_text:
|
907 |
+
# start_char = line_text.index(bbox.text)
|
908 |
+
# end_char = start_char + len(bbox.text)
|
909 |
+
|
910 |
+
# relevant_words = []
|
911 |
+
# current_char = 0
|
912 |
+
# for word in line_info['words']:
|
913 |
+
# word_end = current_char + len(word['text'])
|
914 |
+
# if current_char <= start_char < word_end or current_char < end_char <= word_end or (start_char <= current_char and word_end <= end_char):
|
915 |
+
# relevant_words.append(word)
|
916 |
+
# if word_end >= end_char:
|
917 |
+
# break
|
918 |
+
# current_char = word_end
|
919 |
+
# if not word['text'].endswith(' '):
|
920 |
+
# current_char += 1 # +1 for space if the word doesn't already end with a space
|
921 |
+
|
922 |
+
# if relevant_words:
|
923 |
+
# #print("Relevant words:", relevant_words)
|
924 |
+
# left = min(word['bounding_box'][0] for word in relevant_words)
|
925 |
+
# top = min(word['bounding_box'][1] for word in relevant_words)
|
926 |
+
# right = max(word['bounding_box'][2] for word in relevant_words)
|
927 |
+
# bottom = max(word['bounding_box'][3] for word in relevant_words)
|
928 |
+
|
929 |
+
# # Combine the text of all relevant words
|
930 |
+
# combined_text = " ".join(word['text'] for word in relevant_words)
|
931 |
+
|
932 |
+
# # Calculate new dimensions for the merged box
|
933 |
+
# reconstructed_bbox = CustomImageRecognizerResult(
|
934 |
+
# bbox.entity_type,
|
935 |
+
# bbox.start,
|
936 |
+
# bbox.end,
|
937 |
+
# bbox.score,
|
938 |
+
# left,
|
939 |
+
# top,
|
940 |
+
# right - left, # width
|
941 |
+
# bottom - top, # height
|
942 |
+
# combined_text
|
943 |
+
# )
|
944 |
+
# # Add both the original and the merged bounding box
|
945 |
+
# reconstructed_bboxes.append(bbox) # Retain the original bbox
|
946 |
+
# reconstructed_bboxes.append(reconstructed_bbox) # Add the merged bbox
|
947 |
+
# break
|
948 |
+
# else:
|
949 |
+
# # If the bbox text is not found in any line in combined_results, keep the original bbox
|
950 |
+
# reconstructed_bboxes.append(bbox)
|
951 |
+
|
952 |
+
# # Group reconstructed bboxes by approximate vertical proximity
|
953 |
+
# for box in reconstructed_bboxes:
|
954 |
+
# grouped_bboxes[round(box.top / vertical_threshold)].append(box)
|
955 |
+
|
956 |
+
# # Merge within each group
|
957 |
+
# for _, group in grouped_bboxes.items():
|
958 |
+
# group.sort(key=lambda box: box.left)
|
959 |
+
|
960 |
+
# merged_box = group[0]
|
961 |
+
# for next_box in group[1:]:
|
962 |
+
# if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
|
963 |
+
# # Calculate new dimensions for the merged box
|
964 |
+
# if merged_box.text == next_box.text:
|
965 |
+
# new_text = merged_box.text
|
966 |
+
# else:
|
967 |
+
# new_text = merged_box.text + " " + next_box.text
|
968 |
+
|
969 |
+
# if merged_box.text == next_box.text:
|
970 |
+
# new_text = merged_box.text
|
971 |
+
# new_entity_type = merged_box.entity_type # Keep the original entity type
|
972 |
+
# else:
|
973 |
+
# new_text = merged_box.text + " " + next_box.text
|
974 |
+
# new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
|
975 |
+
|
976 |
+
# new_left = min(merged_box.left, next_box.left)
|
977 |
+
# new_top = min(merged_box.top, next_box.top)
|
978 |
+
# new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
|
979 |
+
# new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
|
980 |
+
# merged_box = CustomImageRecognizerResult(
|
981 |
+
# new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
|
982 |
+
# )
|
983 |
+
# else:
|
984 |
+
# merged_bboxes.append(merged_box)
|
985 |
+
# merged_box = next_box
|
986 |
+
|
987 |
+
# merged_bboxes.append(merged_box)
|
988 |
+
|
989 |
+
# #print("bboxes:", bboxes)
|
990 |
+
|
991 |
+
# return merged_bboxes
|
992 |
|
993 |
def redact_image_pdf(file_path:str,
|
994 |
prepared_pdf_file_paths:List[str],
|
|
|
1015 |
custom_recogniser_word_list:List[str]=[],
|
1016 |
redact_whole_page_list:List[str]=[],
|
1017 |
page_break_val:int=int(page_break_value),
|
1018 |
+
log_files_output_paths:List=[],
|
1019 |
max_time:int=int(max_time_value),
|
1020 |
progress=Progress(track_tqdm=True)):
|
1021 |
|
|
|
1047 |
- custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
|
1048 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
1049 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
1050 |
+
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
1051 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
1052 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
1053 |
|
|
|
1070 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1071 |
print("Connection to AWS Comprehend service unsuccessful.")
|
1072 |
|
1073 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1074 |
|
1075 |
if analysis_type == textract_option and textract_client == "":
|
1076 |
print("Connection to AWS Textract service unsuccessful.")
|
1077 |
|
1078 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1079 |
|
1080 |
tic = time.perf_counter()
|
1081 |
|
|
|
1106 |
if analysis_type == textract_option:
|
1107 |
|
1108 |
json_file_path = output_folder + file_name + "_textract.json"
|
1109 |
+
log_files_output_paths.append(json_file_path)
|
1110 |
|
1111 |
if not os.path.exists(json_file_path):
|
1112 |
no_textract_file = True
|
1113 |
print("No existing Textract results file found.")
|
1114 |
existing_data = {}
|
1115 |
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1116 |
+
#log_files_output_paths.append(json_file_path)
|
1117 |
#request_metadata = request_metadata + "\n" + new_request_metadata
|
1118 |
#wrapped_text_blocks = {"pages":[text_blocks]}
|
1119 |
else:
|
|
|
1184 |
|
1185 |
if not existing_data:
|
1186 |
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1187 |
+
log_files_output_paths.append(json_file_path)
|
1188 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
1189 |
|
1190 |
existing_data = {"pages":[text_blocks]}
|
|
|
1212 |
|
1213 |
# if not os.path.exists(json_file_path):
|
1214 |
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1215 |
+
# log_files_output_paths.append(json_file_path)
|
1216 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1217 |
|
1218 |
# existing_data = {"pages":[text_blocks]}
|
|
|
1242 |
# with open(json_file_path, 'w') as json_file:
|
1243 |
# json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1244 |
|
1245 |
+
# log_files_output_paths.append(json_file_path)
|
1246 |
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1247 |
# else:
|
1248 |
# # If the page exists, retrieve the data
|
|
|
1373 |
|
1374 |
current_loop_page += 1
|
1375 |
|
1376 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1377 |
|
1378 |
if is_pdf(file_path) == False:
|
1379 |
images.append(image)
|
|
|
1394 |
with open(json_file_path, 'w') as json_file:
|
1395 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1396 |
|
1397 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1398 |
|
1399 |
if analysis_type == textract_option:
|
1400 |
# Write the updated existing textract data back to the JSON file
|
|
|
1402 |
with open(json_file_path, 'w') as json_file:
|
1403 |
json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1404 |
|
1405 |
+
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1406 |
|
1407 |
|
1408 |
###
|
|
|
1518 |
|
1519 |
return line_level_results_out, line_level_characters_out # Return both results and character objects
|
1520 |
|
1521 |
+
def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
|
1522 |
'''
|
1523 |
Merge identified bounding boxes containing PII that are very close to one another
|
1524 |
'''
|
1525 |
analysed_bounding_boxes = []
|
1526 |
+
original_bounding_boxes = [] # List to hold original bounding boxes
|
1527 |
+
|
1528 |
if len(analyser_results) > 0 and len(characters) > 0:
|
1529 |
# Extract bounding box coordinates for sorting
|
1530 |
bounding_boxes = []
|
|
|
1531 |
for result in analyser_results:
|
1532 |
+
#print("Result:", result)
|
1533 |
char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1534 |
char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1535 |
if char_boxes:
|
|
|
1538 |
bottom = min(box[1] for box in char_boxes)
|
1539 |
right = max(box[2] for box in char_boxes)
|
1540 |
top = max(box[3] for box in char_boxes) + vertical_padding
|
1541 |
+
bbox = [left, bottom, right, top]
|
1542 |
+
bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
|
1543 |
|
1544 |
+
# Store original bounding boxes
|
1545 |
+
original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
|
1546 |
+
#print("Original bounding boxes:", original_bounding_boxes)
|
1547 |
|
1548 |
# Sort the results by y-coordinate and then by x-coordinate
|
1549 |
bounding_boxes.sort()
|
|
|
1554 |
current_result = None
|
1555 |
current_text = []
|
1556 |
|
1557 |
+
for y, x, result, next_box, text in bounding_boxes:
|
|
|
|
|
|
|
1558 |
if current_y is None or current_box is None:
|
1559 |
+
# Initialize the first bounding box
|
1560 |
+
current_box = next_box
|
1561 |
+
current_y = next_box[1]
|
1562 |
current_result = result
|
1563 |
current_text = list(text)
|
|
|
1564 |
else:
|
1565 |
+
vertical_diff_bboxes = abs(next_box[1] - current_y)
|
1566 |
+
horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
|
1567 |
+
|
1568 |
+
if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
|
1569 |
+
# Merge bounding boxes
|
1570 |
+
#print("Merging boxes")
|
1571 |
+
merged_box = current_box.copy()
|
1572 |
+
merged_result = current_result
|
1573 |
+
merged_text = current_text.copy()
|
1574 |
+
|
1575 |
+
#print("current_box_max_x:", current_box[2])
|
1576 |
+
#print("char_max_x:", next_box[2])
|
1577 |
+
|
1578 |
+
merged_box[2] = next_box[2] # Extend horizontally
|
1579 |
+
merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
|
1580 |
+
merged_result.end = max(current_result.end, result.end) # Extend text range
|
1581 |
try:
|
1582 |
+
merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
|
1583 |
except Exception as e:
|
1584 |
+
print("Unable to combine result entity types:", e)
|
|
|
|
|
1585 |
if current_text:
|
1586 |
+
merged_text.append(" ") # Add space between texts
|
1587 |
+
merged_text.extend(text)
|
1588 |
+
|
1589 |
+
merged_bounding_boxes.append({
|
1590 |
+
"text": "".join(merged_text),
|
1591 |
+
"boundingBox": merged_box,
|
1592 |
+
"result": merged_result
|
1593 |
+
})
|
1594 |
|
|
|
1595 |
else:
|
1596 |
+
# Save the current merged box before starting a new one
|
1597 |
+
# merged_bounding_boxes.append({
|
1598 |
+
# "text": "".join(current_text),
|
1599 |
+
# "boundingBox": current_box,
|
1600 |
+
# "result": current_result
|
1601 |
+
# })
|
1602 |
+
# Start a new bounding box
|
1603 |
+
current_box = next_box
|
1604 |
+
current_y = next_box[1]
|
1605 |
current_result = result
|
1606 |
current_text = list(text)
|
|
|
|
|
1607 |
|
1608 |
+
# Handle the last box
|
1609 |
+
# if current_box is not None:
|
1610 |
+
# merged_bounding_boxes.append({
|
1611 |
+
# "text": "".join(current_text),
|
1612 |
+
# "boundingBox": current_box,
|
1613 |
+
# "result": current_result
|
1614 |
+
# })
|
1615 |
+
|
1616 |
+
# Combine original and merged bounding boxes
|
1617 |
+
analysed_bounding_boxes.extend(original_bounding_boxes)
|
1618 |
+
analysed_bounding_boxes.extend(merged_bounding_boxes)
|
1619 |
+
|
1620 |
+
#print("Analysed bounding boxes:", analysed_bounding_boxes)
|
1621 |
+
|
1622 |
return analysed_bounding_boxes
|
1623 |
|
1624 |
|
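The merge rule in the new merge_text_bounding_boxes joins two PII boxes when they sit on roughly the same line (vertical difference of at most 5 points) and are within combine_pixel_dist of each other horizontally. A self-contained sketch of that criterion on bare [left, bottom, right, top] lists:

def should_merge(current_box, next_box, combine_pixel_dist=20):
    # Boxes are [left, bottom, right, top]; merge when on the same line and close horizontally
    vertical_diff = abs(next_box[1] - current_box[1])
    horizontal_diff = abs(next_box[0] - current_box[2])
    return vertical_diff <= 5 and horizontal_diff <= combine_pixel_dist

print(should_merge([10, 700, 60, 712], [75, 701, 120, 712]))   # True: same line, 15pt gap
print(should_merge([10, 700, 60, 712], [200, 701, 240, 712]))  # False: 140pt gap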
1625 |
+
# def merge_text_bounding_boxes(analyser_results, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
|
1626 |
+
# '''
|
1627 |
+
# Merge identified bounding boxes containing PII that are very close to one another
|
1628 |
+
# '''
|
1629 |
+
# analysed_bounding_boxes = []
|
1630 |
+
# if len(analyser_results) > 0 and len(characters) > 0:
|
1631 |
+
# # Extract bounding box coordinates for sorting
|
1632 |
+
# bounding_boxes = []
|
1633 |
+
# text_out = []
|
1634 |
+
# for result in analyser_results:
|
1635 |
+
# char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1636 |
+
# char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
|
1637 |
+
# if char_boxes:
|
1638 |
+
# # Calculate the bounding box that encompasses all characters
|
1639 |
+
# left = min(box[0] for box in char_boxes)
|
1640 |
+
# bottom = min(box[1] for box in char_boxes)
|
1641 |
+
# right = max(box[2] for box in char_boxes)
|
1642 |
+
# top = max(box[3] for box in char_boxes) + vertical_padding
|
1643 |
+
# bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text)) # (y, x, result, bbox, text)
|
1644 |
+
|
1645 |
+
# char_text = "".join(char_text)
|
1646 |
+
|
1647 |
+
# # Sort the results by y-coordinate and then by x-coordinate
|
1648 |
+
# bounding_boxes.sort()
|
1649 |
+
|
1650 |
+
# merged_bounding_boxes = []
|
1651 |
+
# current_box = None
|
1652 |
+
# current_y = None
|
1653 |
+
# current_result = None
|
1654 |
+
# current_text = []
|
1655 |
+
|
1656 |
+
# for y, x, result, char_box, text in bounding_boxes:
|
1657 |
+
# #print(f"Considering result: {result}")
|
1658 |
+
# #print(f"Character box: {char_box}")
|
1659 |
+
|
1660 |
+
# if current_y is None or current_box is None:
|
1661 |
+
# current_box = char_box
|
1662 |
+
# current_y = char_box[1]
|
1663 |
+
# current_result = result
|
1664 |
+
# current_text = list(text)
|
1665 |
+
# #print(f"Starting new box: {current_box}")
|
1666 |
+
# else:
|
1667 |
+
# vertical_diff_bboxes = abs(char_box[1] - current_y)
|
1668 |
+
# horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
|
1669 |
+
|
1670 |
+
# if (
|
1671 |
+
# vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
|
1672 |
+
# ):
|
1673 |
+
# #print("box is being extended")
|
1674 |
+
# current_box[2] = char_box[2] # Extend the current box horizontally
|
1675 |
+
# current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
|
1676 |
+
# current_result.end = max(current_result.end, result.end) # Extend the text range
|
1677 |
+
# try:
|
1678 |
+
# current_result.entity_type = current_result.entity_type + " - " + result.entity_type
|
1679 |
+
# except Exception as e:
|
1680 |
+
# print("Unable to combine result entity types:")
|
1681 |
+
# print(e)
|
1682 |
+
# # Add a space if current_text is not empty
|
1683 |
+
# if current_text:
|
1684 |
+
# current_text.append(" ") # Add space between texts
|
1685 |
+
# current_text.extend(text)
|
1686 |
+
|
1687 |
+
# #print(f"Latest merged box: {current_box[-1]}")
|
1688 |
+
# else:
|
1689 |
+
# merged_bounding_boxes.append(
|
1690 |
+
# {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
|
1691 |
+
|
1692 |
+
# # Reset current_box and current_y after appending
|
1693 |
+
# current_box = char_box
|
1694 |
+
# current_y = char_box[1]
|
1695 |
+
# current_result = result
|
1696 |
+
# current_text = list(text)
|
1697 |
+
|
1698 |
+
# # After finishing with the current result, add the last box for this result
|
1699 |
+
# if current_box:
|
1700 |
+
# merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
|
1701 |
+
|
1702 |
+
# if not merged_bounding_boxes:
|
1703 |
+
# analysed_bounding_boxes.extend(
|
1704 |
+
# {"text":text, "boundingBox": char.bbox, "result": result}
|
1705 |
+
# for result in analyser_results
|
1706 |
+
# for char in characters[result.start:result.end]
|
1707 |
+
# if isinstance(char, LTChar)
|
1708 |
+
# )
|
1709 |
+
# else:
|
1710 |
+
# analysed_bounding_boxes.extend(merged_bounding_boxes)
|
1711 |
+
|
1712 |
+
# return analysed_bounding_boxes
|
1713 |
+
|
1714 |
|
1715 |
def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
|
1716 |
decision_process_table = pd.DataFrame()
|
tools/helper_functions.py
CHANGED
@@ -3,6 +3,7 @@ import re
|
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
|
|
6 |
from gradio_image_annotation import image_annotator
|
7 |
|
8 |
def reset_state_vars():
|
@@ -38,13 +39,11 @@ textract_option = "AWS Textract service - all PDF types"
|
|
38 |
local_pii_detector = "Local"
|
39 |
aws_pii_detector = "AWS Comprehend"
|
40 |
|
|
|
|
|
41 |
|
42 | - env_var_name = 'GRADIO_OUTPUT_FOLDER'
43 | -
44 | - default_value = 'output/'
45 | -
46 | - output_folder = get_or_create_env_var(env_var_name, default_value)
47 | - print(f'The value of {env_var_name} is {output_folder}')
|
48 |
|
49 |
def load_in_default_allow_list(allow_list_file_path):
|
50 |
if isinstance(allow_list_file_path, str):
|
@@ -105,7 +104,7 @@ def ensure_output_folder_exists():
|
|
105 |
else:
|
106 |
print(f"The 'output/' folder already exists.")
|
107 |
|
108 |
-
def custom_regex_load(in_file):
|
109 |
'''
|
110 |
When file is loaded, update the column dropdown choices and write to relevant data states.
|
111 |
'''
|
@@ -113,6 +112,7 @@ def custom_regex_load(in_file):
|
|
113 |
custom_regex = pd.DataFrame()
|
114 |
|
115 |
if in_file:
|
|
|
116 |
|
117 |
file_list = [string.name for string in in_file]
|
118 |
|
@@ -122,13 +122,13 @@ def custom_regex_load(in_file):
|
|
122 |
custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
|
123 |
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
124 |
|
125 | - output_text = "Allow list file loaded."
126 | print(output_text)
127 | else:
128 | - error = "No allow list file provided."
129 | - print(error)
130 | - output_text = error
131 | - return error, custom_regex
|
132 |
|
133 |
return output_text, custom_regex
|
134 |
|
|
|
3 |
import gradio as gr
|
4 |
import pandas as pd
|
5 |
import unicodedata
|
6 |
+
from typing import List
|
7 |
from gradio_image_annotation import image_annotator
|
8 |
|
9 |
def reset_state_vars():
|
|
|
39 |
local_pii_detector = "Local"
|
40 |
aws_pii_detector = "AWS Comprehend"
|
41 |
|
42 |
+
output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
|
43 |
+
print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
|
44 |
|
45 |
+
input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
|
46 |
+
print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
|
|
|
|
|
|
|
|
|
47 |
|
48 |
def load_in_default_allow_list(allow_list_file_path):
|
49 |
if isinstance(allow_list_file_path, str):
|
|
|
104 |
else:
|
105 |
print(f"The 'output/' folder already exists.")
|
106 |
|
107 |
+
def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
|
108 |
'''
|
109 |
When file is loaded, update the column dropdown choices and write to relevant data states.
|
110 |
'''
|
|
|
112 |
custom_regex = pd.DataFrame()
|
113 |
|
114 |
if in_file:
|
115 |
+
print("File type:", file_type)
|
116 |
|
117 |
file_list = [string.name for string in in_file]
|
118 |
|
|
|
122 |
custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
|
123 |
#regex_file_name_no_ext = get_file_path_end(regex_file_name)
|
124 |
|
125 |
+
output_text = file_type + " file loaded."
|
126 |
+
|
127 |
print(output_text)
|
128 |
else:
|
129 |
+
output_text = "No file provided."
|
130 |
+
print(output_text)
|
131 |
+
return output_text, custom_regex
|
|
|
132 |
|
133 |
return output_text, custom_regex
|
134 |
|
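A hedged usage sketch for the reworked custom_regex_load: in the app it receives Gradio upload objects, so a small stand-in with a .name attribute is enough to exercise it outside the UI. The deny_list.csv path and contents are illustrative, and real upload objects may carry more state:

import pandas as pd
from types import SimpleNamespace
from tools.helper_functions import custom_regex_load

# Illustrative one-column CSV of terms for the deny list
pd.DataFrame(["John Smith", "Project Falcon"]).to_csv("deny_list.csv", index=False, header=False)

uploaded = [SimpleNamespace(name="deny_list.csv")]  # mimics a Gradio file upload entry
message, deny_df = custom_regex_load(uploaded, file_type="Deny list")
print(message)                      # "Deny list file loaded."
print(deny_df.iloc[:, 0].tolist())  # ["John Smith", "Project Falcon"]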
tools/redaction_review.py
CHANGED
@@ -1,13 +1,15 @@
|
|
1 |
import gradio as gr
|
|
|
2 |
import numpy as np
|
3 |
from typing import List
|
4 |
from gradio_image_annotation import image_annotator
|
5 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
6 |
|
7 |
-
from tools.file_conversion import is_pdf,
|
8 |
from tools.helper_functions import get_file_path_end, output_folder
|
9 |
from tools.file_redaction import redact_page_with_pymupdf
|
10 |
import json
|
|
|
11 |
import pymupdf
|
12 |
from fitz import Document
|
13 |
from PIL import ImageDraw, Image
|
@@ -138,13 +140,14 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
|
|
138 |
|
139 |
return all_image_annotations, current_page, current_page
|
140 |
|
141 |
-
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
|
142 |
'''
|
143 |
-
Apply modified redactions to a pymupdf
|
144 |
'''
|
145 |
#print("all_image_annotations:", all_image_annotations)
|
146 |
|
147 |
output_files = []
|
|
|
148 |
|
149 |
image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
|
150 |
|
@@ -154,86 +157,100 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
|
|
154 |
print("No image annotations found")
|
155 |
return doc, all_image_annotations
|
156 |
|
157 | - if isinstance(file_paths, list):
158 | -
159 | - else:
160 | - file_path = file_paths
172 | - # image = Image.open(image_annotated['image'])
173 | - # except:
174 | - # image = Image.fromarray(image_annotated['image'].astype('uint8'))
179 | - coords = [img_annotation_box["xmin"],
180 | - img_annotation_box["ymin"],
181 | - img_annotation_box["xmax"],
182 | - img_annotation_box["ymax"]]
183-236 | - (remaining removed lines of the old apply_redactions are not legible in this extract)
237 |
|
238 |
def crop(annotations:AnnotatedImageData):
|
239 |
if annotations["boxes"]:
|
@@ -246,3 +263,21 @@ def crop(annotations:AnnotatedImageData):
|
|
246 |
|
247 |
def get_boxes_json(annotations:AnnotatedImageData):
|
248 |
return annotations["boxes"]
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
+
import pandas as pd
|
3 |
import numpy as np
|
4 |
from typing import List
|
5 |
from gradio_image_annotation import image_annotator
|
6 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
7 |
|
8 |
+
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
|
9 |
from tools.helper_functions import get_file_path_end, output_folder
|
10 |
from tools.file_redaction import redact_page_with_pymupdf
|
11 |
import json
|
12 |
+
import os
|
13 |
import pymupdf
|
14 |
from fitz import Document
|
15 |
from PIL import ImageDraw, Image
|
|
|
140 |
|
141 |
return all_image_annotations, current_page, current_page
|
142 |
|
143 |
+
def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
|
144 |
'''
|
145 |
+
Apply modified redactions to a pymupdf and export review files
|
146 |
'''
|
147 |
#print("all_image_annotations:", all_image_annotations)
|
148 |
|
149 |
output_files = []
|
150 |
+
output_log_files = []
|
151 |
|
152 |
image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
|
153 |
|
|
|
157 |
print("No image annotations found")
|
158 |
return doc, all_image_annotations
|
159 |
|
160 |
+
if isinstance(file_paths, str):
|
161 |
+
file_paths = [file_paths]
|
|
|
|
|
162 |
|
163 |
+
for file_path in file_paths:
|
164 |
+
print("file_path:", file_path)
|
165 |
+
file_base = get_file_path_end(file_path)
|
166 |
+
|
167 |
+
file_extension = os.path.splitext(file_path)[1].lower()
|
168 |
+
|
169 |
+
# If working with image docs
|
170 |
+
if (is_pdf(file_path) == False) & (file_extension not in '.csv'):
|
171 |
+
image = Image.open(file_paths[-1])
|
172 |
|
173 |
+
#image = pdf_doc
|
174 |
|
175 |
+
draw = ImageDraw.Draw(image)
|
|
|
|
|
|
|
176 |
|
177 |
+
for img_annotation_box in image_annotated['boxes']:
|
178 |
+
coords = [img_annotation_box["xmin"],
|
179 |
+
img_annotation_box["ymin"],
|
180 |
+
img_annotation_box["xmax"],
|
181 |
+
img_annotation_box["ymax"]]
|
182 |
|
183 |
+
fill = img_annotation_box["color"]
|
|
|
|
|
|
|
|
|
184 |
|
185 |
+
draw.rectangle(coords, fill=fill)
|
186 |
|
187 |
+
image.save(output_folder + file_base + "_redacted.png")
|
188 |
|
189 |
+
doc = [image]
|
190 |
|
191 |
+
elif file_extension in '.csv':
|
192 |
+
print("This is a csv")
|
193 |
+
pdf_doc = []
|
194 |
|
195 |
+
# If working with pdfs
|
196 |
+
elif is_pdf(file_path) == True:
|
197 |
+
pdf_doc = pymupdf.open(file_path)
|
198 |
+
|
199 |
+
number_of_pages = pdf_doc.page_count
|
200 |
+
|
201 |
+
print("Saving pages to file.")
|
202 |
+
|
203 |
+
for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
|
204 |
+
|
205 |
+
#print("Saving page", str(i))
|
206 |
+
|
207 |
+
image_loc = all_image_annotations[i]['image']
|
208 |
+
#print("Image location:", image_loc)
|
209 |
+
|
210 |
+
# Load in image object
|
211 |
+
if isinstance(image_loc, np.ndarray):
|
212 |
+
image = Image.fromarray(image_loc.astype('uint8'))
|
213 |
+
#all_image_annotations[i]['image'] = image_loc.tolist()
|
214 |
+
elif isinstance(image_loc, Image.Image):
|
215 |
+
image = image_loc
|
216 |
+
#image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
|
217 |
+
#image_loc.save(image_out_folder)
|
218 |
+
#all_image_annotations[i]['image'] = image_out_folder
|
219 |
+
elif isinstance(image_loc, str):
|
220 |
+
image = Image.open(image_loc)
|
221 |
+
|
222 |
+
pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
|
223 |
+
pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
|
224 |
+
|
225 |
+
else:
|
226 |
+
print("File type not recognised.")
|
227 |
+
|
228 |
+
#try:
|
229 |
+
if pdf_doc:
|
230 |
+
out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
|
231 |
+
pdf_doc.save(out_pdf_file_path)
|
232 |
+
output_files.append(out_pdf_file_path)
|
233 |
+
|
234 |
+
try:
|
235 |
+
# print("Saving annotations to JSON")
|
236 |
+
|
237 |
+
out_annotation_file_path = output_folder + file_base + '_review_file.json'
|
238 |
+
with open(out_annotation_file_path, 'w') as f:
|
239 |
+
json.dump(all_image_annotations, f)
|
240 |
+
output_log_files.append(out_annotation_file_path)
|
241 |
+
|
242 |
+
print("Saving annotations to CSV review file")
|
243 |
+
|
244 |
+
# Convert json to csv and also save this
|
245 |
+
review_df = convert_review_json_to_pandas_df(all_image_annotations)
|
246 |
+
out_review_file_file_path = output_folder + file_base + '_review_file.csv'
|
247 |
+
review_df.to_csv(out_review_file_file_path, index=None)
|
248 |
+
output_files.append(out_review_file_file_path)
|
249 |
+
|
250 |
+
except Exception as e:
|
251 |
+
print("Could not save annotations to JSON or CSV review file:", e)
|
252 |
+
|
253 |
+
return doc, all_image_annotations, output_files, output_log_files
|
254 |
|
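For image inputs, the branch above simply paints filled rectangles over the page image. A self-contained sketch of that idea with PIL; sizes, coordinates and the output path are illustrative:

from PIL import Image, ImageDraw

image = Image.new("RGB", (800, 1000), "white")   # stands in for a scanned page
draw = ImageDraw.Draw(image)

boxes = [{"xmin": 100, "ymin": 200, "xmax": 260, "ymax": 220, "color": (0, 0, 0)}]
for box in boxes:
    draw.rectangle([box["xmin"], box["ymin"], box["xmax"], box["ymax"]], fill=box["color"])

image.save("page_redacted.png")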
255 |
def crop(annotations:AnnotatedImageData):
|
256 |
if annotations["boxes"]:
|
|
|
263 |
|
264 |
def get_boxes_json(annotations:AnnotatedImageData):
|
265 |
return annotations["boxes"]
|
266 |
+
# Group the DataFrame by the 'image' column
|
267 |
+
grouped = df.groupby('image')
|
268 |
+
|
269 |
+
# Create a list to hold the JSON data
|
270 |
+
json_data = []
|
271 |
+
|
272 |
+
# Iterate over each group
|
273 |
+
for image_path, group in grouped:
|
274 |
+
# Convert each group to a list of box dictionaries
|
275 |
+
boxes = group.drop(columns='image').to_dict(orient='records')
|
276 |
+
|
277 |
+
# Append the structured data to the json_data list
|
278 |
+
json_data.append({
|
279 |
+
"image": image_path,
|
280 |
+
"boxes": boxes
|
281 |
+
})
|
282 |
+
|
283 |
+
return json_data
|
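The trailing added lines above group a review DataFrame by its 'image' column and rebuild the per-page annotation structure; the enclosing function definition sits outside the hunk shown here. A hedged, self-contained sketch of that round trip, using a hypothetical wrapper name:

import pandas as pd

def review_df_to_annotation_json(df: pd.DataFrame) -> list:
    # Hypothetical wrapper name; mirrors the groupby logic in the added lines above
    json_data = []
    for image_path, group in df.groupby('image'):
        boxes = group.drop(columns='image').to_dict(orient='records')
        json_data.append({"image": image_path, "boxes": boxes})
    return json_data

df = pd.DataFrame([
    {"image": "page_1.png", "label": "PERSON", "xmin": 100, "ymin": 200, "xmax": 260, "ymax": 220},
    {"image": "page_1.png", "label": "EMAIL_ADDRESS", "xmin": 300, "ymin": 400, "xmax": 480, "ymax": 420},
])
print(review_df_to_annotation_json(df))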