seanpedrickcase committed
Commit a770956 · 1 Parent(s): 928b1e9

Enhance file handling and UI features: improved the Gradio app layout with a fill-width option, and integrated new settings for deny and fully redacted page lists (placeholders so far). Updated file conversion functions to handle CSV inputs and added CSV review file generation for redactions. All original and merged redaction boxes are now retained.

.dockerignore CHANGED
@@ -6,6 +6,7 @@
 *.ipynb
 examples/*
 processing/*
+input/*
 output/*
 tools/__pycache__/*
 old_code/*
.gitignore CHANGED
@@ -6,6 +6,7 @@
 *.ipynb
 examples/*
 processing/*
+input/*
 output/*
 tools/__pycache__/*
 old_code/*
Dockerfile CHANGED
@@ -52,6 +52,7 @@ RUN useradd -m -u 1000 user
 
 # Create required directories
 RUN mkdir -p /home/user/app/output \
+&& mkdir -p /home/user/app/input \
 && mkdir -p /home/user/app/tld \
 && mkdir -p /home/user/app/logs \
 && chown -R user:user /home/user/app
app.py CHANGED
@@ -54,7 +54,7 @@ else:
     default_pii_detector = local_pii_detector
 
 # Create the gradio interface
-app = gr.Blocks(theme = gr.themes.Base())
+app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
 
 with app:
 
@@ -67,7 +67,7 @@ with app:
     all_line_level_ocr_results_df_state = gr.State(pd.DataFrame())
     all_decision_process_table_state = gr.State(pd.DataFrame())
 
-    in_allow_list_state = gr.State(pd.DataFrame())
+
 
     session_hash_state = gr.State()
     s3_output_folder_state = gr.State()
@@ -106,15 +106,7 @@ with app:
     estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
     annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
 
-    s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
-
-    ## S3 default bucket and allow list file state
-    default_allow_list_file_name = "default_allow_list.csv"
-    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
-
-    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
-    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
-    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
+    s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
 
     ## Annotator zoom value
@@ -125,6 +117,25 @@ with app:
     clear_all_page_redactions = gr.State(True)
     prepare_for_review_bool = gr.Checkbox(value=True, visible=False)
 
+    ## Settings page variables
+    default_allow_list_file_name = "default_allow_list.csv"
+    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
+    in_allow_list_state = gr.State(pd.DataFrame())
+
+    default_deny_list_file_name = "default_deny_list.csv"
+    default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
+    in_deny_list_state = gr.State(pd.DataFrame())
+    in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
+
+    fully_redacted_list_file_name = "default_fully_redacted_list.csv"
+    fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
+    in_fully_redacted_list_state = gr.State(pd.DataFrame())
+    in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
+
+    # S3 settings for default allow list load
+    s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
+    s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
+    default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
     ###
     # UI DESIGN
@@ -172,6 +183,10 @@ with app:
         # Object annotation
         with gr.Tab("Review redactions", id="tab_object_annotation"):
 
+            with gr.Accordion(label = "Review previous redactions", open=True):
+                output_review_files = gr.File(label="Review output files", file_count='multiple')
+                upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)")
+
             with gr.Row():
                 annotation_last_page_button = gr.Button("Previous page", scale = 3)
                 annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
@@ -203,9 +218,7 @@ with app:
                 annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
                 annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
-            output_review_files = gr.File(label="Review output files", file_count='multiple')
-            upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...redactions.json)")
-
+
         # TEXT / TABULAR DATA TAB
         with gr.Tab(label="Open text or Excel/csv files"):
             gr.Markdown(
@@ -236,8 +249,6 @@ with app:
             data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
             data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
-
-
         # SETTINGS TAB
         with gr.Tab(label="Redaction settings"):
             gr.Markdown(
@@ -250,14 +261,18 @@ with app:
             page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
             page_max = gr.Number(precision=0,minimum=0,maximum=9999, label="Highest page to redact")
 
-
             with gr.Accordion("Settings for documents and open text/xlsx/csv files", open = True):
                 with gr.Row():
-                    in_allow_list = gr.File(label="Import allow list file", file_count="multiple")
-                    with gr.Column():
-                        gr.Markdown("""Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.""")
+                    with gr.Column():
+                        in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will not be redacted.", file_count="multiple", height=50)
                         in_allow_list_text = gr.Textbox(label="Custom allow list load status")
-
+                    with gr.Column():
+                        in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case sensitive). Terms in this file will always be redacted.", file_count="multiple", height=50)
+                        in_deny_list_text = gr.Textbox(label="Custom deny list load status")
+                    with gr.Column():
+                        in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=50)
+                        in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
+
             with gr.Accordion("Add or remove entity types to redact", open = False):
                 in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
 
@@ -266,15 +281,11 @@ with app:
                 handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
                 #with gr.Row():
                 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)
-
 
             with gr.Accordion("Settings for open text or xlsx/csv files", open = True):
                 anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
 
-            log_files_output = gr.File(label="Log file output", interactive=False)
-
-            # If a custom allow list is uploaded
-            in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+            log_files_output = gr.File(label="Log file output", interactive=False)
 
     ###
     # PDF/IMAGE REDACTION
@@ -283,25 +294,22 @@ with app:
 
     document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
-        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+        then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
+    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
         outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
         then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     # If a file has been completed, the function will continue onto the next document
     latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page]).\
         then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
-    # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
-    # then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],
-    # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state]).\
-    #then(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page]).\
-    #then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
 
-    ### REVIEW REDACTIONS
+    ###
+    # REVIEW PDF REDACTIONS
+    ###
 
     # Page controls at top
     annotate_current_page.submit(
@@ -326,7 +334,7 @@ with app:
         then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
-    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)
+    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
     # Page controls at bottom
     annotate_current_page_bottom.submit(
@@ -355,6 +363,16 @@ with app:
     text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
         then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
 
+    ###
+    # SETTINGS PAGE INPUT / OUTPUT
+    ###
+    # If a custom allow list is uploaded
+    in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
+    in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
+    in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
+
+
+
     ###
     # APP LOAD AND LOGGING
     ###
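For context, the settings-page wiring added above follows a standard Gradio pattern: a gr.File upload fires a change event whose handler fills a status gr.Textbox and a gr.State holding a DataFrame. A minimal self-contained sketch of that pattern, not the app's actual code; load_term_list is a hypothetical stand-in for custom_regex_load:

import gradio as gr
import pandas as pd

def load_term_list(files):
    # Hypothetical loader: read the first uploaded one-column CSV into a
    # DataFrame and report how many terms were loaded.
    if not files:
        return "No file loaded", pd.DataFrame()
    # Gradio may pass file paths (str) or tempfile-like objects with .name
    path = files[0] if isinstance(files[0], str) else files[0].name
    df = pd.read_csv(path, header=None)
    return f"Loaded {len(df)} terms", df

with gr.Blocks() as demo:
    term_state = gr.State(pd.DataFrame())  # persists across events, like in_deny_list_state
    term_file = gr.File(label="Import custom deny list", file_count="multiple")
    term_status = gr.Textbox(label="Custom deny list load status")

    # Same change -> (status textbox, state) wiring as in_deny_list.change above
    term_file.change(fn=load_term_list, inputs=[term_file], outputs=[term_status, term_state])

demo.launch()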
tools/file_conversion.py CHANGED
@@ -1,13 +1,13 @@
 from pdf2image import convert_from_path, pdfinfo_from_path
-from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
+from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, read_file
 from PIL import Image, ImageFile
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 import os
 import re
-import gradio as gr
 import time
 import json
 import pymupdf
+import pandas as pd
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
@@ -48,10 +48,15 @@ def is_pdf(filename):
 
 
 
-def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple[int, str]:
+def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
-        out_path = f"{pdf_path}_{page_num}.png"
+        # Construct the full output directory path relative to the current working directory
+        output_dir = os.path.join(os.getcwd(), output_dir)
+
+        # Use the output_dir to construct the out_path
+        out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
         os.makedirs(os.path.dirname(out_path), exist_ok=True)
+
         if os.path.exists(out_path):
             print(f"Loading existing image for page {page_num + 1}")
             image = Image.open(out_path)
@@ -67,7 +72,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float) -> tuple
         print(f"Error processing page {page_num + 1}: {e}")
         return page_num, None
 
-def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8):
+def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = 200, num_threads: int = 8, output_dir: str = '/input'):
 
     # If preparing for review, just load the first page
     if prepare_for_review == True:
@@ -252,6 +257,7 @@ def prepare_image_or_pdf(
     """
 
     tic = time.perf_counter()
+    json_from_csv = False
 
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -341,10 +347,15 @@ def prepare_image_or_pdf(
         if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
             in_redact_method = tesseract_ocr_option
 
+        if file_extension in ['.csv']:
+            review_file_csv = read_file(file)
+            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv)
+            json_from_csv = True
+
         # If the file name ends with redactions.json, assume it is an annotations object, overwrite the current variable
-        if file_path.endswith(".json"):
+        if (file_extension in ['.json']) | (json_from_csv == True):
 
-            if prepare_for_review == True:
+            if (file_extension in ['.json']) & (prepare_for_review == True):
                 print("Preparing file for review")
                 if isinstance(file_path, str):
                     with open(file_path, 'r') as json_file:
@@ -353,6 +364,20 @@ def prepare_image_or_pdf(
                     # Assuming file_path is a NamedString or similar
                     all_annotations_object = json.loads(file_path) # Use loads for string content
 
+            # Assume it's a textract json
+            elif (file_extension in ['.json']) & (prepare_for_review != True):
+                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
+                json_contents = json.load(file_path)
+                # Write the response to a JSON file in output folder
+                out_folder = output_folder + file_path_without_ext + ".json"
+                with open(out_folder, 'w') as json_file:
+                    json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
+                continue
+
+            # If you have an annotations object from the above code
+            if all_annotations_object:
+                #print("out_annotations_object found:", all_annotations_object)
+
                 # Get list of page numbers
                 image_file_paths_pages = [
                     int(re.search(r'_(\d+)\.png$', os.path.basename(s)).group(1))
@@ -380,19 +405,11 @@ def prepare_image_or_pdf(
                 #print("all_annotations_object:", all_annotations_object)
 
                 # Write the response to a JSON file in output folder
-                out_folder = output_folder + file_path_without_ext + file_extension
+                out_folder = output_folder + file_path_without_ext + ".json"
                 with open(out_folder, 'w') as json_file:
                     json.dump(all_annotations_object, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
                 continue
-
-            else:
-                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
-                json_contents = json.load(file_path)
-                # Write the response to a JSON file in output folder
-                out_folder = output_folder + file_path_without_ext + file_extension
-                with open(out_folder, 'w') as json_file:
-                    json.dump(json_contents, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
-                continue
+
 
         # Must be a pdf or image at this point
         else:
@@ -428,7 +445,6 @@ def prepare_image_or_pdf(
                     page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
                     page.insert_image(rect, filename=file_path) # Insert the image into the page
 
-
         toc = time.perf_counter()
         out_time = f"File '{file_path_without_ext}' prepared in {toc - tic:0.1f} seconds."
 
@@ -467,3 +483,55 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
     #print("Out file paths:", out_file_paths)
 
     return out_message, out_file_paths
+
+
+def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
+    # Flatten the data
+    flattened_data = []
+
+    for entry in data:
+        #print("entry:", entry)
+        #print("flattened_data:", flattened_data)
+        image_path = entry["image"]
+
+        # Use regex to find the number before .png
+        match = re.search(r'_(\d+)\.png$', image_path)
+        if match:
+            number = match.group(1) # Extract the number
+            print(number) # Output: 0
+            reported_number = int(number) + 1
+        else:
+            print("No number found before .png")
+
+        for box in entry["boxes"]:
+            data_to_add = {"image": image_path, "page":reported_number, **box}
+            #print("data_to_add:", data_to_add)
+            flattened_data.append(data_to_add)
+
+    # Convert to a DataFrame
+    df = pd.DataFrame(flattened_data)
+
+    return df
+
+def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
+    # Keep only necessary columns
+    df = df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
+
+    # Group the DataFrame by the 'image' column
+    grouped = df.groupby('image')
+
+    # Create a list to hold the JSON data
+    json_data = []
+
+    # Iterate over each group
+    for image_path, group in grouped:
+        # Convert each group to a list of box dictionaries
+        boxes = group.drop(columns=['image', 'page']).to_dict(orient='records')
+
+        # Append the structured data to the json_data list
+        json_data.append({
+            "image": image_path,
+            "boxes": boxes
+        })
+
+    return json_data
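As a usage note, the two new helpers above give a round trip between the annotator's JSON and a flat CSV review file with one row per redaction box. A minimal sketch, not part of the commit: the annotation shape and file names are inferred from the code above, it assumes running from the repo root, and tuple-valued color fields will come back from CSV as strings.

import pandas as pd
from tools.file_conversion import convert_review_json_to_pandas_df, convert_pandas_df_to_review_json

annotations = [
    {
        "image": "input/example.pdf_0.png",  # "_0.png" -> reported page 1
        "boxes": [
            {"xmin": 10, "ymin": 20, "xmax": 110, "ymax": 40,
             "color": (0, 0, 0), "label": "PERSON"},
        ],
    },
]

# JSON -> one row per box, with a 1-based "page" column parsed from the image name
review_df = convert_review_json_to_pandas_df(annotations)
review_df.to_csv("example_review_file.csv", index=None)  # editable in a spreadsheet

# Edited CSV -> back to the grouped {"image": ..., "boxes": [...]} structure
edited_df = pd.read_csv("example_review_file.csv")
round_tripped = convert_pandas_df_to_review_json(edited_df)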
tools/file_redaction.py CHANGED
@@ -4,6 +4,7 @@ import json
4
  import io
5
  import os
6
  import boto3
 
7
 
8
  from tqdm import tqdm
9
  from PIL import Image, ImageChops, ImageFile, ImageDraw
@@ -25,7 +26,7 @@ from collections import defaultdict # For efficient grouping
25
  from presidio_analyzer import RecognizerResult
26
  from tools.aws_functions import RUN_AWS_FUNCTIONS
27
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
28
- from tools.file_conversion import process_file, image_dpi
29
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
30
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
31
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
@@ -68,6 +69,8 @@ def choose_and_run_redactor(file_paths:List[str],
68
  chosen_redact_comprehend_entities:List[str],
69
  in_redact_method:str,
70
  in_allow_list:List[List[str]]=None,
 
 
71
  latest_file_completed:int=0,
72
  out_message:list=[],
73
  out_file_paths:list=[],
@@ -99,6 +102,8 @@ def choose_and_run_redactor(file_paths:List[str],
99
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
100
  - in_redact_method (str): The method to use for redaction.
101
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
 
 
102
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
103
  - out_message (list, optional): A list to store output messages. Defaults to an empty list.
104
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
@@ -188,7 +193,7 @@ def choose_and_run_redactor(file_paths:List[str],
188
 
189
  if not in_allow_list.empty:
190
  in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
191
- print("In allow list:", in_allow_list_flat)
192
  else:
193
  in_allow_list_flat = []
194
 
@@ -236,7 +241,7 @@ def choose_and_run_redactor(file_paths:List[str],
236
  file_paths_list = file_paths
237
  file_paths_loop = [file_paths_list[int(latest_file_completed)]]
238
 
239
- print("file_paths_list in choose_redactor function:", file_paths_list)
240
 
241
 
242
  for file in file_paths_loop:
@@ -269,7 +274,7 @@ def choose_and_run_redactor(file_paths:List[str],
269
 
270
  print("Redacting file " + file_path_without_ext + " as an image-based file")
271
 
272
- pymupdf_doc,all_decision_process_table,logging_file_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
273
  prepared_pdf_image_paths,
274
  language,
275
  chosen_redact_entities,
@@ -300,7 +305,7 @@ def choose_and_run_redactor(file_paths:List[str],
300
 
301
  elif in_redact_method == text_ocr_option:
302
 
303
- logging_file_paths = ""
304
 
305
  if is_pdf(file_path) == False:
306
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
@@ -353,12 +358,12 @@ def choose_and_run_redactor(file_paths:List[str],
353
 
354
  out_file_paths.append(out_image_file_path)
355
 
356
- if logging_file_paths:
357
- log_files_output_paths.extend(logging_file_paths)
358
 
359
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
360
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
361
- out_file_paths.append(logs_output_file_name)
362
 
363
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
364
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
@@ -366,12 +371,23 @@ def choose_and_run_redactor(file_paths:List[str],
366
 
367
  # Save the gradio_annotation_boxes to a JSON file
368
  try:
369
- out_annotation_file_path = out_image_file_path + '_redactions.json'
 
 
370
  with open(out_annotation_file_path, 'w') as f:
371
  json.dump(annotations_all_pages, f)
372
- out_file_paths.append(out_annotation_file_path)
373
- except:
374
- print("Could not save annotations to json file.")
 
 
 
 
 
 
 
 
 
375
 
376
  # Make a combined message for the file
377
  if isinstance(out_message, list):
@@ -578,7 +594,50 @@ def move_page_info(file_path: str) -> str:
578
 
579
  return new_file_path
580
 
581
- def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None, custom_colours=False):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
582
 
583
  mediabox_height = page.mediabox[3] - page.mediabox[1]
584
  mediabox_width = page.mediabox[2] - page.mediabox[0]
@@ -669,40 +728,42 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None, custo
669
 
670
  all_image_annotation_boxes.append(img_annotation_box)
671
 
672
- # Calculate the middle y value and set a small height (not used)
673
- #print("Rect:", rect)
674
- #middle_y = (pymupdf_y1 + pymupdf_y2) / 2
675
- rect_small_pixel_height = Rect(pymupdf_x1, pymupdf_y1 + 2, pymupdf_x2, pymupdf_y2 - 2) # Slightly smaller than outside box
676
 
677
- # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
678
- #page.add_redact_annot(rect)#rect_small_pixel_height)
679
- page.add_redact_annot(rect_small_pixel_height)
 
 
 
 
680
 
681
- # Set up drawing a black box over the whole rect
682
- shape = page.new_shape()
683
- shape.draw_rect(rect)
684
 
685
- if custom_colours == True:
 
686
 
687
- def convert_color_to_range_0_1(color):
688
- return tuple(component / 255 for component in color)
 
 
 
 
 
 
689
 
690
- if img_annotation_box["color"][0] > 1:
691
- out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
692
- else:
693
- out_colour = img_annotation_box["color"]
694
- else:
695
- out_colour = (0,0,0)
696
 
697
- shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
698
- #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
699
- shape.commit()
700
 
701
  out_annotation_boxes = {
702
  "image": image_path, #Image.open(image_path), #image_path,
703
  "boxes": all_image_annotation_boxes
704
  }
705
 
 
 
 
706
  page.apply_redactions(images=0, graphics=0)
707
  page.clean_contents()
708
 
@@ -713,33 +774,38 @@ def bounding_boxes_overlap(box1, box2):
713
  return (box1[0] < box2[2] and box2[0] < box1[2] and
714
  box1[1] < box2[3] and box2[1] < box1[3])
715
 
 
 
 
 
716
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
 
 
717
  merged_bboxes = []
718
  grouped_bboxes = defaultdict(list)
719
 
720
- # Process signature and handwriting results
 
 
 
721
  if signature_recogniser_results or handwriting_recogniser_results:
722
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
723
- #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
724
- merged_bboxes.extend(handwriting_recogniser_results)
725
 
726
  if "Redact all identified signatures" in handwrite_signature_checkbox:
727
- #print("Signature boxes exist at merge:", signature_recogniser_results)
728
- merged_bboxes.extend(signature_recogniser_results)
729
-
730
 
731
  # Reconstruct bounding boxes for substrings of interest
732
  reconstructed_bboxes = []
733
  for bbox in bboxes:
734
- #print("bbox:", bbox)
735
  bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
736
  for line_text, line_info in combined_results.items():
737
  line_box = line_info['bounding_box']
738
- if bounding_boxes_overlap(bbox_box, line_box):
739
  if bbox.text in line_text:
740
  start_char = line_text.index(bbox.text)
741
  end_char = start_char + len(bbox.text)
742
-
743
  relevant_words = []
744
  current_char = 0
745
  for word in line_info['words']:
@@ -753,16 +819,13 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
753
  current_char += 1 # +1 for space if the word doesn't already end with a space
754
 
755
  if relevant_words:
756
- #print("Relevant words:", relevant_words)
757
  left = min(word['bounding_box'][0] for word in relevant_words)
758
  top = min(word['bounding_box'][1] for word in relevant_words)
759
  right = max(word['bounding_box'][2] for word in relevant_words)
760
  bottom = max(word['bounding_box'][3] for word in relevant_words)
761
-
762
- # Combine the text of all relevant words
763
  combined_text = " ".join(word['text'] for word in relevant_words)
764
 
765
- # Calculate new dimensions for the merged box
766
  reconstructed_bbox = CustomImageRecognizerResult(
767
  bbox.entity_type,
768
  bbox.start,
@@ -771,13 +834,13 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
771
  left,
772
  top,
773
  right - left, # width
774
- bottom - top, # height
775
  combined_text
776
  )
777
- reconstructed_bboxes.append(reconstructed_bbox)
 
778
  break
779
  else:
780
- # If the bbox text is not found in any line in combined_results, keep the original bbox
781
  reconstructed_bboxes.append(bbox)
782
 
783
  # Group reconstructed bboxes by approximate vertical proximity
@@ -791,35 +854,141 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
791
  merged_box = group[0]
792
  for next_box in group[1:]:
793
  if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
794
- # Calculate new dimensions for the merged box
795
- if merged_box.text == next_box.text:
796
- new_text = merged_box.text
797
- else:
798
- new_text = merged_box.text + " " + next_box.text
799
-
800
- if merged_box.text == next_box.text:
801
- new_text = merged_box.text
802
- new_entity_type = merged_box.entity_type # Keep the original entity type
803
- else:
804
- new_text = merged_box.text + " " + next_box.text
805
- new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
806
 
807
  new_left = min(merged_box.left, next_box.left)
808
  new_top = min(merged_box.top, next_box.top)
809
  new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
810
  new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
 
811
  merged_box = CustomImageRecognizerResult(
812
  new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
813
  )
814
  else:
815
  merged_bboxes.append(merged_box)
816
- merged_box = next_box
817
 
818
  merged_bboxes.append(merged_box)
819
 
820
- #print("bboxes:", bboxes)
821
-
822
- return merged_bboxes
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
823
 
824
  def redact_image_pdf(file_path:str,
825
  prepared_pdf_file_paths:List[str],
@@ -846,7 +1015,7 @@ def redact_image_pdf(file_path:str,
846
  custom_recogniser_word_list:List[str]=[],
847
  redact_whole_page_list:List[str]=[],
848
  page_break_val:int=int(page_break_value),
849
- logging_file_paths:List=[],
850
  max_time:int=int(max_time_value),
851
  progress=Progress(track_tqdm=True)):
852
 
@@ -878,7 +1047,7 @@ def redact_image_pdf(file_path:str,
878
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
879
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
880
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
881
- - logging_file_paths (List, optional): List of file paths used for saving redaction process logging results.
882
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
883
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
884
 
@@ -901,12 +1070,12 @@ def redact_image_pdf(file_path:str,
901
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
902
  print("Connection to AWS Comprehend service unsuccessful.")
903
 
904
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
905
 
906
  if analysis_type == textract_option and textract_client == "":
907
  print("Connection to AWS Textract service unsuccessful.")
908
 
909
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
910
 
911
  tic = time.perf_counter()
912
 
@@ -937,14 +1106,14 @@ def redact_image_pdf(file_path:str,
937
  if analysis_type == textract_option:
938
 
939
  json_file_path = output_folder + file_name + "_textract.json"
940
- logging_file_paths.append(json_file_path)
941
 
942
  if not os.path.exists(json_file_path):
943
  no_textract_file = True
944
  print("No existing Textract results file found.")
945
  existing_data = {}
946
  #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
947
- #logging_file_paths.append(json_file_path)
948
  #request_metadata = request_metadata + "\n" + new_request_metadata
949
  #wrapped_text_blocks = {"pages":[text_blocks]}
950
  else:
@@ -1015,7 +1184,7 @@ def redact_image_pdf(file_path:str,
1015
 
1016
  if not existing_data:
1017
  text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1018
- logging_file_paths.append(json_file_path)
1019
  request_metadata = request_metadata + "\n" + new_request_metadata
1020
 
1021
  existing_data = {"pages":[text_blocks]}
@@ -1043,7 +1212,7 @@ def redact_image_pdf(file_path:str,
1043
 
1044
  # if not os.path.exists(json_file_path):
1045
  # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1046
- # logging_file_paths.append(json_file_path)
1047
  # request_metadata = request_metadata + "\n" + new_request_metadata
1048
 
1049
  # existing_data = {"pages":[text_blocks]}
@@ -1073,7 +1242,7 @@ def redact_image_pdf(file_path:str,
1073
  # with open(json_file_path, 'w') as json_file:
1074
  # json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1075
 
1076
- # logging_file_paths.append(json_file_path)
1077
  # request_metadata = request_metadata + "\n" + new_request_metadata
1078
  # else:
1079
  # # If the page exists, retrieve the data
@@ -1204,7 +1373,7 @@ def redact_image_pdf(file_path:str,
1204
 
1205
  current_loop_page += 1
1206
 
1207
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1208
 
1209
  if is_pdf(file_path) == False:
1210
  images.append(image)
@@ -1225,7 +1394,7 @@ def redact_image_pdf(file_path:str,
1225
  with open(json_file_path, 'w') as json_file:
1226
  json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1227
 
1228
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1229
 
1230
  if analysis_type == textract_option:
1231
  # Write the updated existing textract data back to the JSON file
@@ -1233,7 +1402,7 @@ def redact_image_pdf(file_path:str,
1233
  with open(json_file_path, 'w') as json_file:
1234
  json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1235
 
1236
- return pymupdf_doc, all_decision_process_table, logging_file_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1237
 
1238
 
1239
  ###
@@ -1349,16 +1518,18 @@ def create_text_bounding_boxes_from_characters(char_objects:List[LTChar]) -> Tup
1349
 
1350
  return line_level_results_out, line_level_characters_out # Return both results and character objects
1351
 
1352
- def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
1353
  '''
1354
  Merge identified bounding boxes containing PII that are very close to one another
1355
  '''
1356
  analysed_bounding_boxes = []
 
 
1357
  if len(analyser_results) > 0 and len(characters) > 0:
1358
  # Extract bounding box coordinates for sorting
1359
  bounding_boxes = []
1360
- text_out = []
1361
  for result in analyser_results:
 
1362
  char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1363
  char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1364
  if char_boxes:
@@ -1367,9 +1538,12 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
1367
  bottom = min(box[1] for box in char_boxes)
1368
  right = max(box[2] for box in char_boxes)
1369
  top = max(box[3] for box in char_boxes) + vertical_padding
1370
- bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text)) # (y, x, result, bbox, text)
 
1371
 
1372
- char_text = "".join(char_text)
 
 
1373
 
1374
  # Sort the results by y-coordinate and then by x-coordinate
1375
  bounding_boxes.sort()
@@ -1380,74 +1554,163 @@ def merge_text_bounding_boxes(analyser_results:CustomImageRecognizerResult, char
1380
  current_result = None
1381
  current_text = []
1382
 
1383
- for y, x, result, char_box, text in bounding_boxes:
1384
- #print(f"Considering result: {result}")
1385
- #print(f"Character box: {char_box}")
1386
-
1387
  if current_y is None or current_box is None:
1388
- current_box = char_box
1389
- current_y = char_box[1]
 
1390
  current_result = result
1391
  current_text = list(text)
1392
- #print(f"Starting new box: {current_box}")
1393
  else:
1394
- vertical_diff_bboxes = abs(char_box[1] - current_y)
1395
- horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
1396
-
1397
- #print(f"Comparing boxes: current_box={current_box}, char_box={char_box}, current_text={current_text}, char_text={text}")
1398
- #print(f"Vertical diff: {vertical_diff_bboxes}, Horizontal diff: {horizontal_diff_bboxes}")
1399
-
1400
- if (
1401
- vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
1402
- ):
1403
- #print("box is being extended")
1404
- current_box[2] = char_box[2] # Extend the current box horizontally
1405
- current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
1406
- current_result.end = max(current_result.end, result.end) # Extend the text range
 
 
 
1407
  try:
1408
- current_result.entity_type = current_result.entity_type + " - " + result.entity_type
1409
  except Exception as e:
1410
- print("Unable to combine result entity types:")
1411
- print(e)
1412
- # Add a space if current_text is not empty
1413
  if current_text:
1414
- current_text.append(" ") # Add space between texts
1415
- current_text.extend(text)
 
 
 
 
 
 
1416
 
1417
- #print(f"Latest merged box: {current_box[-1]}")
1418
  else:
1419
- merged_bounding_boxes.append(
1420
- {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
1421
- #print(f"Appending merged box: {current_box}")
1422
- #print(f"Latest merged box: {merged_bounding_boxes[-1]}")
1423
-
1424
- # Reset current_box and current_y after appending
1425
- current_box = char_box
1426
- current_y = char_box[1]
 
1427
  current_result = result
1428
  current_text = list(text)
1429
- #print(f"Starting new box: {current_box}")
1430
-
1431
- # After finishing with the current result, add the last box for this result
1432
- if current_box:
1433
- merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
1434
- #print(f"Appending final box for result: {current_box}")
1435
-
1436
- if not merged_bounding_boxes:
1437
- analysed_bounding_boxes.extend(
1438
- {"text":text, "boundingBox": char.bbox, "result": result}
1439
- for result in analyser_results
1440
- for char in characters[result.start:result.end]
1441
- if isinstance(char, LTChar)
1442
- )
1443
- else:
1444
- analysed_bounding_boxes.extend(merged_bounding_boxes)
1445
 
1446
- #print("Analyzed bounding boxes:\n\n", analysed_bounding_boxes)
1447
-
1448
  return analysed_bounding_boxes
1449
 
1450
 
1451
 
1452
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1453
  decision_process_table = pd.DataFrame()
 
4
  import io
5
  import os
6
  import boto3
7
+ import copy
8
 
9
  from tqdm import tqdm
10
  from PIL import Image, ImageChops, ImageFile, ImageDraw
 
26
  from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
29
+ from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
  from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 
69
  chosen_redact_comprehend_entities:List[str],
70
  in_redact_method:str,
71
  in_allow_list:List[List[str]]=None,
72
+ in_deny_list:List[List[str]]=None,
73
+ in_fully_redacted_list:List[List[str]]=None,
74
  latest_file_completed:int=0,
75
  out_message:list=[],
76
  out_file_paths:list=[],
 
102
  - chosen_redact_comprehend_entities (List[str]): A list of entity types to redact from files, chosen from the official list from AWS Comprehend service
103
  - in_redact_method (str): The method to use for redaction.
104
  - in_allow_list (List[List[str]], optional): A list of allowed terms for redaction. Defaults to None.
105
+ - in_deny_list (List[List[str]], optional): A list of terms to always redact (deny list). Defaults to None.
106
+ - in_fully_redacted_list (List[List[str]], optional): A list of pages to fully redact. Defaults to None.
107
  - latest_file_completed (int, optional): The index of the last completed file. Defaults to 0.
108
  - out_message (list, optional): A list to store output messages. Defaults to an empty list.
109
  - out_file_paths (list, optional): A list to store paths to the output files. Defaults to an empty list.
 
193
 
194
  if not in_allow_list.empty:
195
  in_allow_list_flat = in_allow_list.iloc[:,0].tolist()
196
+ #print("In allow list:", in_allow_list_flat)
197
  else:
198
  in_allow_list_flat = []
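For illustration, the allow list is read as a single-column DataFrame with no header, so taking .iloc[:, 0] flattens it to a plain list of terms (a minimal standalone sketch; the sample terms are made up):

import pandas as pd

allow = pd.DataFrame(["ACME Ltd", "Widget Co"])  # as read with header=None
print(allow.iloc[:, 0].tolist())  # ['ACME Ltd', 'Widget Co']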
199
 
 
241
  file_paths_list = file_paths
242
  file_paths_loop = [file_paths_list[int(latest_file_completed)]]
243
 
244
+ # print("file_paths_list in choose_redactor function:", file_paths_list)
245
 
246
 
247
  for file in file_paths_loop:
 
274
 
275
  print("Redacting file " + file_path_without_ext + " as an image-based file")
276
 
277
+ pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
278
  prepared_pdf_image_paths,
279
  language,
280
  chosen_redact_entities,
 
305
 
306
  elif in_redact_method == text_ocr_option:
307
 
308
+ #log_files_output_paths = []
309
 
310
  if is_pdf(file_path) == False:
311
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
 
358
 
359
  out_file_paths.append(out_image_file_path)
360
 
361
+ #if log_files_output_paths:
362
+ # log_files_output_paths.extend(log_files_output_paths)
363
 
364
  logs_output_file_name = out_image_file_path + "_decision_process_output.csv"
365
  all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
366
+ log_files_output_paths.append(logs_output_file_name)
367
 
368
  all_text_output_file_name = out_image_file_path + "_ocr_output.csv"
369
  all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
 
371
 
372
  # Save the gradio_annotation_boxes to a JSON file
373
  try:
374
+ print("Saving annotations to JSON")
375
+
376
+ out_annotation_file_path = out_image_file_path + '_review_file.json'
377
  with open(out_annotation_file_path, 'w') as f:
378
  json.dump(annotations_all_pages, f)
379
+ log_files_output_paths.append(out_annotation_file_path)
380
+
381
+ print("Saving annotations to CSV")
382
+
383
+ # Convert json to csv and also save this
384
+ review_df = convert_review_json_to_pandas_df(annotations_all_pages)
385
+ out_review_file_file_path = out_image_file_path + '_review_file.csv'
386
+ review_df.to_csv(out_review_file_file_path, index=None)
387
+ out_file_paths.append(out_review_file_file_path)
388
+
389
+ except Exception as e:
390
+ print("Could not save annotations to json file:", e)
391
 
392
  # Make a combined message for the file
393
  if isinstance(out_message, list):
 
594
 
595
  return new_file_path
596
 
597
+ def convert_color_to_range_0_1(color):
598
+ return tuple(component / 255 for component in color)
599
+
600
+ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:dict, custom_colours:bool=False):
601
+ pymupdf_x1 = pymupdf_rect[0]
602
+ pymupdf_y1 = pymupdf_rect[1]
603
+ pymupdf_x2 = pymupdf_rect[2]
604
+ pymupdf_y2 = pymupdf_rect[3]
605
+
606
+ # Calculate area to actually remove text from the pdf (different from black box size)
607
+ redact_bottom_y = pymupdf_y1 + 2
608
+ redact_top_y = pymupdf_y2 - 2
609
+
610
+ # Calculate the middle y value and set a small height if default values are too close together
611
+ if (redact_top_y - redact_bottom_y) < 1:
612
+ middle_y = (pymupdf_y1 + pymupdf_y2) / 2
613
+ redact_bottom_y = middle_y - 1
614
+ redact_top_y = middle_y + 1
615
+
616
+ #print("Rect:", rect)
617
+
618
+ rect_small_pixel_height = Rect(pymupdf_x1, redact_bottom_y, pymupdf_x2, redact_top_y) # Slightly smaller than outside box
619
+
620
+ # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
621
+ #page.add_redact_annot(rect)#rect_small_pixel_height)
622
+ pymupdf_page.add_redact_annot(rect_small_pixel_height)
623
+
624
+ # Set up drawing a black box over the whole rect
625
+ shape = pymupdf_page.new_shape()
626
+ shape.draw_rect(pymupdf_rect)
627
+
628
+ if custom_colours == True:
629
+ if img_annotation_box["color"][0] > 1:
630
+ out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
631
+ else:
632
+ out_colour = img_annotation_box["color"]
633
+ else:
634
+ out_colour = (0,0,0)
635
+
636
+ shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
637
+ #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
638
+ shape.commit()
639
+
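A usage sketch for convert_color_to_range_0_1 and redact_single_box (assumes PyMuPDF is installed; the file name and sample text are illustrative):

import pymupdf
from pymupdf import Rect

doc = pymupdf.open()                        # new, empty PDF
page = doc.new_page(width=595, height=842)  # A4 in points
page.insert_text((72, 100), "john.smith@example.com")

box = {"color": (255, 0, 0)}                # 0-255 RGB; converted to 0-1 internally
redact_single_box(page, Rect(70, 88, 250, 104), box, custom_colours=True)
page.apply_redactions()                     # remove the underlying text
doc.save("redacted_example.pdf")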
640
+ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Image=None, custom_colours:bool=False, redact_whole_page:bool=False):
641
 
642
  mediabox_height = page.mediabox[3] - page.mediabox[1]
643
  mediabox_width = page.mediabox[2] - page.mediabox[0]
 
728
 
729
  all_image_annotation_boxes.append(img_annotation_box)
730
 
731
+ redact_single_box(page, rect, img_annotation_box, custom_colours)
 
 
 
732
 
733
+ # If whole page is to be redacted, do that here
734
+ if redact_whole_page == True:
735
+ # Leave a small white border around the edge of the page
736
+ border = 5
737
+ # Define the coordinates for the Rect
738
+ whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
739
+ whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
740
 
741
+ whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
 
 
742
 
743
+ # Create new image annotation element based on whole page coordinates
744
+ whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
745
 
746
+ # Write whole page annotation to annotation boxes
747
+ whole_page_img_annotation_box = {}
748
+ whole_page_img_annotation_box["xmin"] = whole_page_image_x1
749
+ whole_page_img_annotation_box["ymin"] = whole_page_image_y1
750
+ whole_page_img_annotation_box["xmax"] = whole_page_image_x2
751
+ whole_page_img_annotation_box["ymax"] = whole_page_image_y2
752
+ whole_page_img_annotation_box["color"] = (0,0,0)
753
+ whole_page_img_annotation_box["label"] = "Whole page"
754
 
755
+ redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
756
 
757
+ all_image_annotation_boxes.append(whole_page_img_annotation_box)
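Concretely, for an A4-sized page of 595 x 842 points the inset works out as below (a standalone check of the border arithmetic above):

border = 5
rect_width, rect_height = 595, 842
whole_page_x1, whole_page_y1 = 0 + border, 0 + border
whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border
print((whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2))  # (5, 5, 590, 837)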
 
 
758
 
759
  out_annotation_boxes = {
760
  "image": image_path, #Image.open(image_path), #image_path,
761
  "boxes": all_image_annotation_boxes
762
  }
763
 
764
+
765
+
766
+
767
  page.apply_redactions(images=0, graphics=0)
768
  page.clean_contents()
769
 
 
774
  return (box1[0] < box2[2] and box2[0] < box1[2] and
775
  box1[1] < box2[3] and box2[1] < box1[3])
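This is the standard separating-axis overlap test on (x0, y0, x1, y1) boxes, for example:

assert bounding_boxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))
assert not bounding_boxes_overlap((0, 0, 10, 10), (10, 0, 20, 10))  # edge-touching boxes don't count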
776
 
777
+ from collections import defaultdict
778
+ from typing import List, Dict
779
+ import copy
780
+
781
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
782
+
783
+ all_bboxes = []
784
  merged_bboxes = []
785
  grouped_bboxes = defaultdict(list)
786
 
787
+ # Deep copy original bounding boxes to retain them
788
+ original_bboxes = copy.deepcopy(bboxes)
789
+
790
+ # Process signature and handwriting results
791
  if signature_recogniser_results or handwriting_recogniser_results:
792
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
793
+ merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
 
794
 
795
  if "Redact all identified signatures" in handwrite_signature_checkbox:
796
+ merged_bboxes.extend(copy.deepcopy(signature_recogniser_results))
 
 
797
 
798
  # Reconstruct bounding boxes for substrings of interest
799
  reconstructed_bboxes = []
800
  for bbox in bboxes:
 
801
  bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
802
  for line_text, line_info in combined_results.items():
803
  line_box = line_info['bounding_box']
804
+ if bounding_boxes_overlap(bbox_box, line_box):
805
  if bbox.text in line_text:
806
  start_char = line_text.index(bbox.text)
807
  end_char = start_char + len(bbox.text)
808
+
809
  relevant_words = []
810
  current_char = 0
811
  for word in line_info['words']:
 
819
  current_char += 1 # +1 for space if the word doesn't already end with a space
820
 
821
  if relevant_words:
 
822
  left = min(word['bounding_box'][0] for word in relevant_words)
823
  top = min(word['bounding_box'][1] for word in relevant_words)
824
  right = max(word['bounding_box'][2] for word in relevant_words)
825
  bottom = max(word['bounding_box'][3] for word in relevant_words)
826
+
 
827
  combined_text = " ".join(word['text'] for word in relevant_words)
828
 
 
829
  reconstructed_bbox = CustomImageRecognizerResult(
830
  bbox.entity_type,
831
  bbox.start,
 
834
  left,
835
  top,
836
  right - left, # width
837
+ bottom - top, # height
838
  combined_text
839
  )
840
+ #reconstructed_bboxes.append(bbox) # Add original bbox
841
+ reconstructed_bboxes.append(reconstructed_bbox) # Add merged bbox
842
  break
843
  else:
 
844
  reconstructed_bboxes.append(bbox)
845
 
846
  # Group reconstructed bboxes by approximate vertical proximity
 
854
  merged_box = group[0]
855
  for next_box in group[1:]:
856
  if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
857
+ new_text = merged_box.text + " " + next_box.text
858
+ new_entity_type = merged_box.entity_type + " - " + next_box.entity_type
859
 
860
  new_left = min(merged_box.left, next_box.left)
861
  new_top = min(merged_box.top, next_box.top)
862
  new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
863
  new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
864
+
865
  merged_box = CustomImageRecognizerResult(
866
  new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
867
  )
868
  else:
869
  merged_bboxes.append(merged_box)
870
+ merged_box = next_box
871
 
872
  merged_bboxes.append(merged_box)
873
 
874
+ all_bboxes.extend(original_bboxes)
875
+ #all_bboxes.extend(reconstructed_bboxes)
876
+ all_bboxes.extend(merged_bboxes)
877
+
878
+ # Return the unique original and merged bounding boxes
879
+ unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
880
+ return unique_bboxes
881
+
882
+
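Note the dict-comprehension dedup keeps the last box seen for each (left, top, width, height) key, so a merged box supersedes an identical original. A standalone illustration with plain tuples:

boxes = [("orig", 0, 0, 10, 10), ("merged", 0, 0, 10, 10), ("other", 5, 5, 10, 10)]
unique = list({(b[1], b[2], b[3], b[4]): b for b in boxes}.values())
print(unique)  # [('merged', 0, 0, 10, 10), ('other', 5, 5, 10, 10)]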
883
+ # def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
884
+ # merged_bboxes = []
885
+ # grouped_bboxes = defaultdict(list)
886
+
887
+ # # Process signature and handwriting results
888
+ # if signature_recogniser_results or handwriting_recogniser_results:
889
+ # if "Redact all identified handwriting" in handwrite_signature_checkbox:
890
+ # #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
891
+ # merged_bboxes.extend(handwriting_recogniser_results)
892
+
893
+ # if "Redact all identified signatures" in handwrite_signature_checkbox:
894
+ # #print("Signature boxes exist at merge:", signature_recogniser_results)
895
+ # merged_bboxes.extend(signature_recogniser_results)
896
+
897
+
898
+ # # Reconstruct bounding boxes for substrings of interest
899
+ # reconstructed_bboxes = []
900
+ # for bbox in bboxes:
901
+ # #print("bbox:", bbox)
902
+ # bbox_box = (bbox.left, bbox.top, bbox.left + bbox.width, bbox.top + bbox.height)
903
+ # for line_text, line_info in combined_results.items():
904
+ # line_box = line_info['bounding_box']
905
+ # if bounding_boxes_overlap(bbox_box, line_box):
906
+ # if bbox.text in line_text:
907
+ # start_char = line_text.index(bbox.text)
908
+ # end_char = start_char + len(bbox.text)
909
+
910
+ # relevant_words = []
911
+ # current_char = 0
912
+ # for word in line_info['words']:
913
+ # word_end = current_char + len(word['text'])
914
+ # if current_char <= start_char < word_end or current_char < end_char <= word_end or (start_char <= current_char and word_end <= end_char):
915
+ # relevant_words.append(word)
916
+ # if word_end >= end_char:
917
+ # break
918
+ # current_char = word_end
919
+ # if not word['text'].endswith(' '):
920
+ # current_char += 1 # +1 for space if the word doesn't already end with a space
921
+
922
+ # if relevant_words:
923
+ # #print("Relevant words:", relevant_words)
924
+ # left = min(word['bounding_box'][0] for word in relevant_words)
925
+ # top = min(word['bounding_box'][1] for word in relevant_words)
926
+ # right = max(word['bounding_box'][2] for word in relevant_words)
927
+ # bottom = max(word['bounding_box'][3] for word in relevant_words)
928
+
929
+ # # Combine the text of all relevant words
930
+ # combined_text = " ".join(word['text'] for word in relevant_words)
931
+
932
+ # # Calculate new dimensions for the merged box
933
+ # reconstructed_bbox = CustomImageRecognizerResult(
934
+ # bbox.entity_type,
935
+ # bbox.start,
936
+ # bbox.end,
937
+ # bbox.score,
938
+ # left,
939
+ # top,
940
+ # right - left, # width
941
+ # bottom - top, # height
942
+ # combined_text
943
+ # )
944
+ # # Add both the original and the merged bounding box
945
+ # reconstructed_bboxes.append(bbox) # Retain the original bbox
946
+ # reconstructed_bboxes.append(reconstructed_bbox) # Add the merged bbox
947
+ # break
948
+ # else:
949
+ # # If the bbox text is not found in any line in combined_results, keep the original bbox
950
+ # reconstructed_bboxes.append(bbox)
951
+
952
+ # # Group reconstructed bboxes by approximate vertical proximity
953
+ # for box in reconstructed_bboxes:
954
+ # grouped_bboxes[round(box.top / vertical_threshold)].append(box)
955
+
956
+ # # Merge within each group
957
+ # for _, group in grouped_bboxes.items():
958
+ # group.sort(key=lambda box: box.left)
959
+
960
+ # merged_box = group[0]
961
+ # for next_box in group[1:]:
962
+ # if next_box.left - (merged_box.left + merged_box.width) <= horizontal_threshold:
963
+ # # Calculate new dimensions for the merged box
964
+ # if merged_box.text == next_box.text:
965
+ # new_text = merged_box.text
966
+ # else:
967
+ # new_text = merged_box.text + " " + next_box.text
968
+
969
+ # if merged_box.text == next_box.text:
970
+ # new_text = merged_box.text
971
+ # new_entity_type = merged_box.entity_type # Keep the original entity type
972
+ # else:
973
+ # new_text = merged_box.text + " " + next_box.text
974
+ # new_entity_type = merged_box.entity_type + " - " + next_box.entity_type # Concatenate entity types
975
+
976
+ # new_left = min(merged_box.left, next_box.left)
977
+ # new_top = min(merged_box.top, next_box.top)
978
+ # new_width = max(merged_box.left + merged_box.width, next_box.left + next_box.width) - new_left
979
+ # new_height = max(merged_box.top + merged_box.height, next_box.top + next_box.height) - new_top
980
+ # merged_box = CustomImageRecognizerResult(
981
+ # new_entity_type, merged_box.start, merged_box.end, merged_box.score, new_left, new_top, new_width, new_height, new_text
982
+ # )
983
+ # else:
984
+ # merged_bboxes.append(merged_box)
985
+ # merged_box = next_box
986
+
987
+ # merged_bboxes.append(merged_box)
988
+
989
+ # #print("bboxes:", bboxes)
990
+
991
+ # return merged_bboxes
992
 
993
  def redact_image_pdf(file_path:str,
994
  prepared_pdf_file_paths:List[str],
 
1015
  custom_recogniser_word_list:List[str]=[],
1016
  redact_whole_page_list:List[str]=[],
1017
  page_break_val:int=int(page_break_value),
1018
+ log_files_output_paths:List=[],
1019
  max_time:int=int(max_time_value),
1020
  progress=Progress(track_tqdm=True)):
1021
 
 
1047
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
1048
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1049
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
1050
+ - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
1051
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1052
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
1053
 
 
1070
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1071
  print("Connection to AWS Comprehend service unsuccessful.")
1072
 
1073
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1074
 
1075
  if analysis_type == textract_option and textract_client == "":
1076
  print("Connection to AWS Textract service unsuccessful.")
1077
 
1078
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1079
 
1080
  tic = time.perf_counter()
1081
 
 
1106
  if analysis_type == textract_option:
1107
 
1108
  json_file_path = output_folder + file_name + "_textract.json"
1109
+ log_files_output_paths.append(json_file_path)
1110
 
1111
  if not os.path.exists(json_file_path):
1112
  no_textract_file = True
1113
  print("No existing Textract results file found.")
1114
  existing_data = {}
1115
  #text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1116
+ #log_files_output_paths.append(json_file_path)
1117
  #request_metadata = request_metadata + "\n" + new_request_metadata
1118
  #wrapped_text_blocks = {"pages":[text_blocks]}
1119
  else:
 
1184
 
1185
  if not existing_data:
1186
  text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1187
+ log_files_output_paths.append(json_file_path)
1188
  request_metadata = request_metadata + "\n" + new_request_metadata
1189
 
1190
  existing_data = {"pages":[text_blocks]}
 
1212
 
1213
  # if not os.path.exists(json_file_path):
1214
  # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1215
+ # log_files_output_paths.append(json_file_path)
1216
  # request_metadata = request_metadata + "\n" + new_request_metadata
1217
 
1218
  # existing_data = {"pages":[text_blocks]}
 
1242
  # with open(json_file_path, 'w') as json_file:
1243
  # json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1244
 
1245
+ # log_files_output_paths.append(json_file_path)
1246
  # request_metadata = request_metadata + "\n" + new_request_metadata
1247
  # else:
1248
  # # If the page exists, retrieve the data
 
1373
 
1374
  current_loop_page += 1
1375
 
1376
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1377
 
1378
  if is_pdf(file_path) == False:
1379
  images.append(image)
 
1394
  with open(json_file_path, 'w') as json_file:
1395
  json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1396
 
1397
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1398
 
1399
  if analysis_type == textract_option:
1400
  # Write the updated existing textract data back to the JSON file
 
1402
  with open(json_file_path, 'w') as json_file:
1403
  json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1404
 
1405
+ return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1406
 
1407
 
1408
  ###
 
1518
 
1519
  return line_level_results_out, line_level_characters_out # Return both results and character objects
1520
 
1521
+ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
1522
  '''
1523
  Merge identified bounding boxes containing PII that are very close to one another
1524
  '''
1525
  analysed_bounding_boxes = []
1526
+ original_bounding_boxes = [] # List to hold original bounding boxes
1527
+
1528
  if len(analyser_results) > 0 and len(characters) > 0:
1529
  # Extract bounding box coordinates for sorting
1530
  bounding_boxes = []
 
1531
  for result in analyser_results:
1532
+ #print("Result:", result)
1533
  char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1534
  char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1535
  if char_boxes:
 
1538
  bottom = min(box[1] for box in char_boxes)
1539
  right = max(box[2] for box in char_boxes)
1540
  top = max(box[3] for box in char_boxes) + vertical_padding
1541
+ bbox = [left, bottom, right, top]
1542
+ bounding_boxes.append((bottom, left, result, bbox, char_text)) # (y, x, result, bbox, text)
1543
 
1544
+ # Store original bounding boxes
1545
+ original_bounding_boxes.append({"text": "".join(char_text), "boundingBox": bbox, "result": copy.deepcopy(result)})
1546
+ #print("Original bounding boxes:", original_bounding_boxes)
1547
 
1548
  # Sort the results by y-coordinate and then by x-coordinate
1549
  bounding_boxes.sort()
 
1554
  current_result = None
1555
  current_text = []
1556
 
1557
+ for y, x, result, next_box, text in bounding_boxes:
 
 
 
1558
  if current_y is None or current_box is None:
1559
+ # Initialize the first bounding box
1560
+ current_box = next_box
1561
+ current_y = next_box[1]
1562
  current_result = result
1563
  current_text = list(text)
 
1564
  else:
1565
+ vertical_diff_bboxes = abs(next_box[1] - current_y)
1566
+ horizontal_diff_bboxes = abs(next_box[0] - current_box[2])
1567
+
1568
+ if vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist:
1569
+ # Merge bounding boxes
1570
+ #print("Merging boxes")
1571
+ merged_box = current_box.copy()
1572
+ merged_result = current_result
1573
+ merged_text = current_text.copy()
1574
+
1575
+ #print("current_box_max_x:", current_box[2])
1576
+ #print("char_max_x:", next_box[2])
1577
+
1578
+ merged_box[2] = next_box[2] # Extend horizontally
1579
+ merged_box[3] = max(current_box[3], next_box[3]) # Adjust the top
1580
+ merged_result.end = max(current_result.end, result.end) # Extend text range
1581
  try:
1582
+ merged_result.entity_type = current_result.entity_type + " - " + result.entity_type
1583
  except Exception as e:
1584
+ print("Unable to combine result entity types:", e)
 
 
1585
  if current_text:
1586
+ merged_text.append(" ") # Add space between texts
1587
+ merged_text.extend(text)
1588
+
1589
+ merged_bounding_boxes.append({
1590
+ "text": "".join(merged_text),
1591
+ "boundingBox": merged_box,
1592
+ "result": merged_result
1593
+ })
1594
 
 
1595
  else:
1596
+ # Save the current merged box before starting a new one
1597
+ # merged_bounding_boxes.append({
1598
+ # "text": "".join(current_text),
1599
+ # "boundingBox": current_box,
1600
+ # "result": current_result
1601
+ # })
1602
+ # Start a new bounding box
1603
+ current_box = next_box
1604
+ current_y = next_box[1]
1605
  current_result = result
1606
  current_text = list(text)
 
1607
 
1608
+ # Handle the last box
1609
+ # if current_box is not None:
1610
+ # merged_bounding_boxes.append({
1611
+ # "text": "".join(current_text),
1612
+ # "boundingBox": current_box,
1613
+ # "result": current_result
1614
+ # })
1615
+
1616
+ # Combine original and merged bounding boxes
1617
+ analysed_bounding_boxes.extend(original_bounding_boxes)
1618
+ analysed_bounding_boxes.extend(merged_bounding_boxes)
1619
+
1620
+ #print("Analysed bounding boxes:", analysed_bounding_boxes)
1621
+
1622
  return analysed_bounding_boxes
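A hedged sketch of exercising the rewritten function on hand-built inputs. LTChar is awkward to construct directly, so a minimal stand-in subclass carries just the bbox and _text attributes the function reads; all coordinates are made up:

from pdfminer.layout import LTChar
from presidio_analyzer import RecognizerResult

class _Char(LTChar):
    def __init__(self, bbox, text):  # deliberately skip LTChar.__init__
        self.bbox = bbox             # (x0, y0, x1, y1) in points
        self._text = text

chars = [_Char((10 + 6 * i, 100, 16 + 6 * i, 112), c)
         for i, c in enumerate("John Smith")]
results = [RecognizerResult("PERSON", 0, 4, 0.85),   # "John"
           RecognizerResult("PERSON", 5, 10, 0.85)]  # "Smith", close enough to merge

for box in merge_text_bounding_boxes(results, chars, combine_pixel_dist=20):
    print(box["text"], box["boundingBox"])  # two originals plus the merged span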
1623
 
1624
 
1625
+ # def merge_text_bounding_boxes(analyser_results, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
1626
+ # '''
1627
+ # Merge identified bounding boxes containing PII that are very close to one another
1628
+ # '''
1629
+ # analysed_bounding_boxes = []
1630
+ # if len(analyser_results) > 0 and len(characters) > 0:
1631
+ # # Extract bounding box coordinates for sorting
1632
+ # bounding_boxes = []
1633
+ # text_out = []
1634
+ # for result in analyser_results:
1635
+ # char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1636
+ # char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
1637
+ # if char_boxes:
1638
+ # # Calculate the bounding box that encompasses all characters
1639
+ # left = min(box[0] for box in char_boxes)
1640
+ # bottom = min(box[1] for box in char_boxes)
1641
+ # right = max(box[2] for box in char_boxes)
1642
+ # top = max(box[3] for box in char_boxes) + vertical_padding
1643
+ # bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text)) # (y, x, result, bbox, text)
1644
+
1645
+ # char_text = "".join(char_text)
1646
+
1647
+ # # Sort the results by y-coordinate and then by x-coordinate
1648
+ # bounding_boxes.sort()
1649
+
1650
+ # merged_bounding_boxes = []
1651
+ # current_box = None
1652
+ # current_y = None
1653
+ # current_result = None
1654
+ # current_text = []
1655
+
1656
+ # for y, x, result, char_box, text in bounding_boxes:
1657
+ # #print(f"Considering result: {result}")
1658
+ # #print(f"Character box: {char_box}")
1659
+
1660
+ # if current_y is None or current_box is None:
1661
+ # current_box = char_box
1662
+ # current_y = char_box[1]
1663
+ # current_result = result
1664
+ # current_text = list(text)
1665
+ # #print(f"Starting new box: {current_box}")
1666
+ # else:
1667
+ # vertical_diff_bboxes = abs(char_box[1] - current_y)
1668
+ # horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
1669
+
1670
+ # if (
1671
+ # vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
1672
+ # ):
1673
+ # #print("box is being extended")
1674
+ # current_box[2] = char_box[2] # Extend the current box horizontally
1675
+ # current_box[3] = max(current_box[3], char_box[3]) # Ensure the top is the highest
1676
+ # current_result.end = max(current_result.end, result.end) # Extend the text range
1677
+ # try:
1678
+ # current_result.entity_type = current_result.entity_type + " - " + result.entity_type
1679
+ # except Exception as e:
1680
+ # print("Unable to combine result entity types:")
1681
+ # print(e)
1682
+ # # Add a space if current_text is not empty
1683
+ # if current_text:
1684
+ # current_text.append(" ") # Add space between texts
1685
+ # current_text.extend(text)
1686
+
1687
+ # #print(f"Latest merged box: {current_box[-1]}")
1688
+ # else:
1689
+ # merged_bounding_boxes.append(
1690
+ # {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
1691
+
1692
+ # # Reset current_box and current_y after appending
1693
+ # current_box = char_box
1694
+ # current_y = char_box[1]
1695
+ # current_result = result
1696
+ # current_text = list(text)
1697
+
1698
+ # # After finishing with the current result, add the last box for this result
1699
+ # if current_box:
1700
+ # merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
1701
+
1702
+ # if not merged_bounding_boxes:
1703
+ # analysed_bounding_boxes.extend(
1704
+ # {"text":text, "boundingBox": char.bbox, "result": result}
1705
+ # for result in analyser_results
1706
+ # for char in characters[result.start:result.end]
1707
+ # if isinstance(char, LTChar)
1708
+ # )
1709
+ # else:
1710
+ # analysed_bounding_boxes.extend(merged_bounding_boxes)
1711
+
1712
+ # return analysed_bounding_boxes
1713
+
1714
 
1715
  def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
1716
  decision_process_table = pd.DataFrame()
tools/helper_functions.py CHANGED
@@ -3,6 +3,7 @@ import re
3
  import gradio as gr
4
  import pandas as pd
5
  import unicodedata
 
6
  from gradio_image_annotation import image_annotator
7
 
8
  def reset_state_vars():
@@ -38,13 +39,11 @@ textract_option = "AWS Textract service - all PDF types"
38
  local_pii_detector = "Local"
39
  aws_pii_detector = "AWS Comprehend"
40
 
 
 
41
 
42
- # Retrieving or setting output folder
43
- env_var_name = 'GRADIO_OUTPUT_FOLDER'
44
- default_value = 'output/'
45
-
46
- output_folder = get_or_create_env_var(env_var_name, default_value)
47
- print(f'The value of {env_var_name} is {output_folder}')
48
 
49
  def load_in_default_allow_list(allow_list_file_path):
50
  if isinstance(allow_list_file_path, str):
@@ -105,7 +104,7 @@ def ensure_output_folder_exists():
105
  else:
106
  print(f"The 'output/' folder already exists.")
107
 
108
- def custom_regex_load(in_file):
109
  '''
110
  When file is loaded, update the column dropdown choices and write to relevant data states.
111
  '''
@@ -113,6 +112,7 @@ def custom_regex_load(in_file):
113
  custom_regex = pd.DataFrame()
114
 
115
  if in_file:
 
116
 
117
  file_list = [string.name for string in in_file]
118
 
@@ -122,13 +122,13 @@ def custom_regex_load(in_file):
122
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
123
  #regex_file_name_no_ext = get_file_path_end(regex_file_name)
124
 
125
- output_text = "Allow list file loaded."
 
126
  print(output_text)
127
  else:
128
- error = "No allow list file provided."
129
- print(error)
130
- output_text = error
131
- return error, custom_regex
132
 
133
  return output_text, custom_regex
134
 
 
3
  import gradio as gr
4
  import pandas as pd
5
  import unicodedata
6
+ from typing import List
7
  from gradio_image_annotation import image_annotator
8
 
9
  def reset_state_vars():
 
39
  local_pii_detector = "Local"
40
  aws_pii_detector = "AWS Comprehend"
41
 
42
+ output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
43
+ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
44
 
45
+ input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
46
+ print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
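get_or_create_env_var is defined earlier in this file; for context, a plausible sketch of its behaviour (this body is an assumption, not the committed implementation):

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value  # set it so later lookups agree
        value = default_value
    return value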
47
 
48
  def load_in_default_allow_list(allow_list_file_path):
49
  if isinstance(allow_list_file_path, str):
 
104
  else:
105
  print(f"The 'output/' folder already exists.")
106
 
107
+ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
108
  '''
109
  When file is loaded, update the column dropdown choices and write to relevant data states.
110
  '''
 
112
  custom_regex = pd.DataFrame()
113
 
114
  if in_file:
115
+ print("File type:", file_type)
116
 
117
  file_list = [string.name for string in in_file]
118
 
 
122
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
123
  #regex_file_name_no_ext = get_file_path_end(regex_file_name)
124
 
125
+ output_text = file_type + " file loaded."
126
+
127
  print(output_text)
128
  else:
129
+ output_text = "No file provided."
130
+ print(output_text)
131
+ return output_text, custom_regex
 
132
 
133
  return output_text, custom_regex
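A usage sketch for the reworked loader. Gradio hands uploaded files over as objects with a .name path, so a SimpleNamespace stand-in is enough off-app (the file name and terms are illustrative):

from types import SimpleNamespace
import pandas as pd

pd.DataFrame(["John Smith", "Jane Doe"]).to_csv("deny_list.csv", index=False, header=False)
message, deny_df = custom_regex_load([SimpleNamespace(name="deny_list.csv")], file_type="Deny list")
print(message)              # Deny list file loaded.
print(deny_df[0].tolist())  # ['John Smith', 'Jane Doe']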
134
 
tools/redaction_review.py CHANGED
@@ -1,13 +1,15 @@
1
  import gradio as gr
 
2
  import numpy as np
3
  from typing import List
4
  from gradio_image_annotation import image_annotator
5
  from gradio_image_annotation.image_annotator import AnnotatedImageData
6
 
7
- from tools.file_conversion import is_pdf, convert_pdf_to_images
8
  from tools.helper_functions import get_file_path_end, output_folder
9
  from tools.file_redaction import redact_page_with_pymupdf
10
  import json
 
11
  import pymupdf
12
  from fitz import Document
13
  from PIL import ImageDraw, Image
@@ -138,13 +140,14 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
138
 
139
  return all_image_annotations, current_page, current_page
140
 
141
- def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
142
  '''
143
- Apply modified redactions to a pymupdf
144
  '''
145
  #print("all_image_annotations:", all_image_annotations)
146
 
147
  output_files = []
 
148
 
149
  image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
150
 
@@ -154,86 +157,100 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
154
  print("No image annotations found")
155
  return doc, all_image_annotations
156
 
157
- if isinstance(file_paths, list):
158
- file_path = file_paths[-1].name
159
- else:
160
- file_path = file_paths
161
 
162
- print("file_path:", file_path)
163
- file_base = get_file_path_end(file_path)
164
-
165
- # If working with image docs
166
- if is_pdf(file_path) == False:
167
- pdf_doc = Image.open(file_paths[-1])
 
 
 
168
 
169
- image = pdf_doc
170
 
171
- # try:
172
- # image = Image.open(image_annotated['image'])
173
- # except:
174
- # image = Image.fromarray(image_annotated['image'].astype('uint8'))
175
 
176
- draw = ImageDraw.Draw(pdf_doc)
177
 
178
- for img_annotation_box in image_annotated['boxes']:
179
- coords = [img_annotation_box["xmin"],
180
- img_annotation_box["ymin"],
181
- img_annotation_box["xmax"],
182
- img_annotation_box["ymax"]]
183
 
184
- fill = img_annotation_box["color"]
185
 
186
- draw.rectangle(coords, fill=fill)
187
 
188
- image.save(output_folder + file_base + "_redacted.png")
189
 
190
- doc = [image]
 
 
191
 
192
- # If working with pdfs
193
- else:
194
- pdf_doc = pymupdf.open(file_path)
195
-
196
- number_of_pages = pdf_doc.page_count
197
-
198
- print("Saving pages to file.")
199
-
200
- for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
201
-
202
- #print("Saving page", str(i))
203
-
204
- image_loc = all_image_annotations[i]['image']
205
- #print("Image location:", image_loc)
206
-
207
- # Load in image object
208
- if isinstance(image_loc, np.ndarray):
209
- image = Image.fromarray(image_loc.astype('uint8'))
210
- #all_image_annotations[i]['image'] = image_loc.tolist()
211
- elif isinstance(image_loc, Image.Image):
212
- image = image_loc
213
- #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
214
- #image_loc.save(image_out_folder)
215
- #all_image_annotations[i]['image'] = image_out_folder
216
- elif isinstance(image_loc, str):
217
- image = Image.open(image_loc)
218
-
219
- pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
220
- pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
221
-
222
- #try:
223
- out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
224
- pdf_doc.save(out_pdf_file_path)
225
- output_files.append(out_pdf_file_path)
226
-
227
- # Save the gradio_annotation_boxes to a JSON file
228
- try:
229
- out_annotation_file_path = output_folder + file_base + '_redactions.json'
230
- with open(out_annotation_file_path, 'w') as f:
231
- json.dump(all_image_annotations, f)
232
- output_files.append(out_annotation_file_path)
233
- except:
234
- print("Could not save annotations to json file.")
235
-
236
- return doc, all_image_annotations, output_files
 
237
 
238
  def crop(annotations:AnnotatedImageData):
239
  if annotations["boxes"]:
@@ -246,3 +263,21 @@ def crop(annotations:AnnotatedImageData):
246
 
247
  def get_boxes_json(annotations:AnnotatedImageData):
248
  return annotations["boxes"]
 
1
  import gradio as gr
2
+ import pandas as pd
3
  import numpy as np
4
  from typing import List
5
  from gradio_image_annotation import image_annotator
6
  from gradio_image_annotation.image_annotator import AnnotatedImageData
7
 
8
+ from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
9
  from tools.helper_functions import get_file_path_end, output_folder
10
  from tools.file_redaction import redact_page_with_pymupdf
11
  import json
12
+ import os
13
  import pymupdf
14
  from fitz import Document
15
  from PIL import ImageDraw, Image
 
140
 
141
  return all_image_annotations, current_page, current_page
142
 
143
+ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
144
  '''
145
+ Apply modified redactions to a pymupdf document and export review files
146
  '''
147
  #print("all_image_annotations:", all_image_annotations)
148
 
149
  output_files = []
150
+ output_log_files = []
151
 
152
  image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
153
 
 
157
  print("No image annotations found")
158
  return doc, all_image_annotations
159
 
160
+ if isinstance(file_paths, str):
161
+ file_paths = [file_paths]
 
 
162
 
163
+ for file_path in file_paths:
164
+ print("file_path:", file_path)
165
+ file_base = get_file_path_end(file_path)
166
+
167
+ file_extension = os.path.splitext(file_path)[1].lower()
168
+
169
+ # If working with image docs
170
+ if (is_pdf(file_path) == False) and (file_extension != '.csv'):
171
+ image = Image.open(file_path)
172
 
173
+ #image = pdf_doc
174
 
175
+ draw = ImageDraw.Draw(image)
 
 
 
176
 
177
+ for img_annotation_box in image_annotated['boxes']:
178
+ coords = [img_annotation_box["xmin"],
179
+ img_annotation_box["ymin"],
180
+ img_annotation_box["xmax"],
181
+ img_annotation_box["ymax"]]
182
 
183
+ fill = img_annotation_box["color"]
184
 
185
+ draw.rectangle(coords, fill=fill)
186
 
187
+ image.save(output_folder + file_base + "_redacted.png")
188
 
189
+ doc = [image]
190
 
191
+ elif file_extension == '.csv':
192
+ print("This is a csv")
193
+ pdf_doc = []
194
 
195
+ # If working with pdfs
196
+ elif is_pdf(file_path) == True:
197
+ pdf_doc = pymupdf.open(file_path)
198
+
199
+ number_of_pages = pdf_doc.page_count
200
+
201
+ print("Saving pages to file.")
202
+
203
+ for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
204
+
205
+ #print("Saving page", str(i))
206
+
207
+ image_loc = all_image_annotations[i]['image']
208
+ #print("Image location:", image_loc)
209
+
210
+ # Load in image object
211
+ if isinstance(image_loc, np.ndarray):
212
+ image = Image.fromarray(image_loc.astype('uint8'))
213
+ #all_image_annotations[i]['image'] = image_loc.tolist()
214
+ elif isinstance(image_loc, Image.Image):
215
+ image = image_loc
216
+ #image_out_folder = output_folder + file_base + "_page_" + str(i) + ".png"
217
+ #image_loc.save(image_out_folder)
218
+ #all_image_annotations[i]['image'] = image_out_folder
219
+ elif isinstance(image_loc, str):
220
+ image = Image.open(image_loc)
221
+
222
+ pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
223
+ pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
224
+
225
+ else:
226
+ print("File type not recognised.")
227
+
228
+ #try:
229
+ if pdf_doc:
230
+ out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
231
+ pdf_doc.save(out_pdf_file_path)
232
+ output_files.append(out_pdf_file_path)
233
+
234
+ try:
235
+ # print("Saving annotations to JSON")
236
+
237
+ out_annotation_file_path = output_folder + file_base + '_review_file.json'
238
+ with open(out_annotation_file_path, 'w') as f:
239
+ json.dump(all_image_annotations, f)
240
+ output_log_files.append(out_annotation_file_path)
241
+
242
+ print("Saving annotations to CSV review file")
243
+
244
+ # Convert json to csv and also save this
245
+ review_df = convert_review_json_to_pandas_df(all_image_annotations)
246
+ out_review_file_file_path = output_folder + file_base + '_review_file.csv'
247
+ review_df.to_csv(out_review_file_file_path, index=None)
248
+ output_files.append(out_review_file_file_path)
249
+
250
+ except Exception as e:
251
+ print("Could not save annotations to json file:", e)
252
+
253
+ return doc, all_image_annotations, output_files, output_log_files
254
 
255
  def crop(annotations:AnnotatedImageData):
256
  if annotations["boxes"]:
 
263
 
264
  def get_boxes_json(annotations:AnnotatedImageData):
265
  return annotations["boxes"]
266
+ # Group the DataFrame by the 'image' column
267
+ grouped = df.groupby('image')
268
+
269
+ # Create a list to hold the JSON data
270
+ json_data = []
271
+
272
+ # Iterate over each group
273
+ for image_path, group in grouped:
274
+ # Convert each group to a list of box dictionaries
275
+ boxes = group.drop(columns='image').to_dict(orient='records')
276
+
277
+ # Append the structured data to the json_data list
278
+ json_data.append({
279
+ "image": image_path,
280
+ "boxes": boxes
281
+ })
282
+
283
+ return json_data
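The grouping above appears to be the inverse of convert_review_json_to_pandas_df. A usage sketch with an assumed review DataFrame layout (column names are assumptions):

import pandas as pd

df = pd.DataFrame([
    {"image": "page_0.png", "xmin": 10, "ymin": 20, "xmax": 110, "ymax": 40,
     "color": "(0, 0, 0)", "label": "PERSON"},
    {"image": "page_0.png", "xmin": 15, "ymin": 60, "xmax": 90, "ymax": 80,
     "color": "(0, 0, 0)", "label": "EMAIL"},
])

json_data = [{"image": image_path,
              "boxes": group.drop(columns='image').to_dict(orient='records')}
             for image_path, group in df.groupby('image')]
print(json_data[0]["boxes"][1]["label"])  # EMAIL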