Commit a03496e
Parent: 59ff822

Side review bar is mostly there, and a couple of bugs are fixed. Identified text can now be returned in the initial review files. Still working on retaining the found text throughout the review process.

Files changed:
- app.py (+28, -17)
- tools/file_conversion.py (+34, -9)
- tools/file_redaction.py (+134, -274)
- tools/redaction_review.py (+28, -6)
app.py
CHANGED
@@ -8,6 +8,7 @@ import gradio as gr
 import pandas as pd
 from datetime import datetime
 from gradio_image_annotation import image_annotator
+from gradio_image_annotation.image_annotator import AnnotatedImageData
 
 from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name

@@ -80,6 +81,8 @@ with app:
 output_file_list_state = gr.State([])
 text_output_file_list_state = gr.State([])
 log_files_output_list_state = gr.State([])
+
+review_file_state = gr.State(pd.DataFrame())
 
 # Logging state
 log_file_name = 'log.csv'

@@ -113,7 +116,7 @@ with app:
 
 
 ## Annotator zoom value
-annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=...
+annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
 zoom_true_bool = gr.State(True)
 zoom_false_bool = gr.State(False)

@@ -204,9 +207,9 @@ with app:
 annotate_zoom_in = gr.Button("Zoom in")
 annotate_zoom_out = gr.Button("Zoom out")
 with gr.Row():
-clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page")
-
-annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
+annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")
+
 
 with gr.Row():

@@ -233,10 +236,8 @@ with app:
 )
 
 with gr.Column(scale=1):
-recogniser_entity_dropdown = gr.Dropdown(value="ALL", allow_custom_value=True)
-recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas")
-
-
+recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
+recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
 
 with gr.Row():
 annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)

@@ -245,6 +246,9 @@ with app:
 annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
 
 
+
+
+
 # TEXT / TABULAR DATA TAB
 with gr.Tab(label="Open text or Excel/csv files"):
 gr.Markdown(

@@ -304,7 +308,7 @@ with app:
 
 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
 
-handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting"])
+handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
 #with gr.Row():
 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False, visible=False)

@@ -319,7 +323,7 @@ with app:
 in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
 
 document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
-then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state], api_name="prepare_doc").\
+then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
 then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
 outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
 then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])

@@ -339,49 +343,56 @@ with app:
 
 # Upload previous files for modifying redactions
 upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state]).\
+then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Page controls at top
 annotate_current_page.submit(
-modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
+
+
 annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Zoom in and out on annotator
-annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
 
-annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
 
 annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
-clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
-annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
+annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
 
 # Page controls at bottom
 annotate_current_page_bottom.submit(
-modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
+modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+
 annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
 
 # Review side bar controls
 recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
 
 recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=annotate_current_page).\
-then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
 then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
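For context on the wiring pattern repeated throughout this diff: each control first runs a state-mutating step, then a re-render step via Gradio's chained events. A minimal, self-contained sketch of that pattern (illustrative names, not code from this repo):

import gradio as gr

def decrease_page(page):                      # step 1: mutate the page number
    return max(1, page - 1)

def render(page, pages_state):                # step 2: re-render from state
    return f"Showing page {int(page)} of {len(pages_state)}"

with gr.Blocks() as demo:
    pages_state = gr.State(["p1", "p2", "p3"])   # shared, per-session state
    page = gr.Number(value=1, precision=0, label="Page")
    view = gr.Textbox(label="Annotator")
    prev_btn = gr.Button("Previous page")

    # .then() guarantees render() runs only after decrease_page() finishes,
    # mirroring the modify_existing_page_redactions -> update_annotator chains above.
    prev_btn.click(fn=decrease_page, inputs=[page], outputs=[page]).\
        then(fn=render, inputs=[page, pages_state], outputs=[view])

demo.launch()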
tools/file_conversion.py
CHANGED
@@ -399,6 +399,7 @@ def prepare_image_or_pdf(
 converted_file_paths = []
 image_file_paths = []
 pymupdf_doc = []
+review_file_csv = pd.DataFrame()
 
 if not file_paths:
 file_paths = []

@@ -424,7 +425,7 @@ def prepare_image_or_pdf(
 final_out_message = '\n'.join(out_message)
 else:
 final_out_message = out_message
-return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

@@ -457,7 +458,7 @@ def prepare_image_or_pdf(
 if not file_path:
 out_message = "Please select a file."
 print(out_message)
-return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 file_extension = os.path.splitext(file_path)[1].lower()

@@ -478,7 +479,7 @@ def prepare_image_or_pdf(
 
 all_annotations_object.append(annotation)
 
-print("all_annotations_object:", all_annotations_object)
+#print("all_annotations_object:", all_annotations_object)
 
 elif is_pdf_or_image(file_path): # Alternatively, if it's an image

@@ -597,13 +598,13 @@ def prepare_image_or_pdf(
 if is_pdf_or_image(file_path) == False:
 out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
 print(out_message)
-return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 elif in_redact_method == text_ocr_option:
 if is_pdf(file_path) == False:
 out_message = "Please upload a PDF file for text analysis."
 print(out_message)
-return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 converted_file_paths.append(converted_file_path)

@@ -624,7 +625,7 @@ def prepare_image_or_pdf(
 
 #print("all_annotations_object at end:", all_annotations_object)
 
-return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object
+return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
 
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
 file_path_without_ext = get_file_path_end(in_file_path)

@@ -650,7 +651,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
 return out_message, out_file_paths
 
 
-def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
+def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFrame) -> pd.DataFrame:
 # Flatten the data
 flattened_data = []

@@ -670,16 +671,40 @@ def convert_review_json_to_pandas_df(data:List[dict]) -> pd.DataFrame:
 
 # Check if 'boxes' is in the entry, if not, add an empty list
 if 'boxes' not in entry:
-entry['boxes'] = []
+entry['boxes'] = []
 
 for box in entry["boxes"]:
-...
+if 'text' not in box:
+data_to_add = {"image": image_path, "page": reported_number, **box} # "text": entry['text'],
+else:
+data_to_add = {"image": image_path, "page": reported_number, "text": entry['text'], **box}
 #print("data_to_add:", data_to_add)
 flattened_data.append(data_to_add)
 
 # Convert to a DataFrame
 df = pd.DataFrame(flattened_data)
 
+# Join on additional text data from decision output results if included
+if not text_join_data.empty:
+#print("text_join_data:", text_join_data)
+#print("df:", df)
+text_join_data['page'] = text_join_data['page'].astype(str)
+df['page'] = df['page'].astype(str)
+text_join_data = text_join_data[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
+text_join_data[['xmin', 'ymin', 'xmax', 'ymax']] = text_join_data[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
+df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float).round(0)
+
+df = df.merge(text_join_data, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
+
+df = df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
+
+df = df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
+
+if 'text' not in df.columns:
+df['text'] = ''
+
+df = df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
 return df
 
 def convert_pandas_df_to_review_json(df: pd.DataFrame) -> List[dict]:
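The merge added to convert_review_json_to_pandas_df is the heart of the "return identified text in initial review files" change: box coordinates come from two different pipelines as slightly different floats, so both sides are rounded before joining the OCR'd text back onto the review boxes. A standalone sketch of that technique with made-up data (not from this repo):

import pandas as pd

boxes = pd.DataFrame({          # review-file boxes (no text yet)
    "page": ["1"], "label": ["PERSON"],
    "xmin": [100.02], "ymin": [200.01], "xmax": [150.0], "ymax": [210.0],
})
decisions = pd.DataFrame({      # decision-process output carrying the text
    "page": ["1"], "label": ["PERSON"],
    "xmin": [99.98], "ymin": [199.99], "xmax": [150.04], "ymax": [209.97],
    "text": ["Jane Doe"],
})

coords = ["xmin", "ymin", "xmax", "ymax"]
boxes[[c + "1" for c in coords]] = boxes[coords].astype(float).round(0)   # 100.02 -> 100.0
decisions[coords] = decisions[coords].astype(float).round(0)              #  99.98 -> 100.0

merged = boxes.merge(
    decisions,
    left_on=[c + "1" for c in coords] + ["label", "page"],
    right_on=coords + ["label", "page"],
    how="left", suffixes=("", "_y"),
)
print(merged["text"])  # 0    Jane Doe

Rounding to whole points makes the join tolerant of sub-point float drift, which is presumably why the exact coordinates are not used as keys directly.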
tools/file_redaction.py
CHANGED
@@ -40,6 +40,11 @@ print(f'The value of page_break_value is {page_break_value}')
 max_time_value = get_or_create_env_var('max_time_value', '999999')
 print(f'The value of max_time_value is {max_time_value}')
 
+...
 def sum_numbers_before_seconds(string:str):
 """Extracts numbers that precede the word 'seconds' from a string and adds them up.

@@ -396,7 +401,7 @@ def choose_and_run_redactor(file_paths:List[str],
 # Convert json to csv and also save this
 #print("annotations_all_pages:", annotations_all_pages)
 
-review_df = convert_review_json_to_pandas_df(annotations_all_pages)
+...
 
 out_review_file_file_path = out_image_file_path + '_review_file.csv'
 review_df.to_csv(out_review_file_file_path, index=None)

@@ -452,7 +457,7 @@ def choose_and_run_redactor(file_paths:List[str],
 
 return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
-def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
+...
 '''
 Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
 '''

@@ -474,7 +479,10 @@ def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
 x_diff_ratio = media_reference_x_diff / reference_box_width
 
 # Extract the annotation rectangle field
-...
+...
 rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
 
 # Unpack coordinates

@@ -487,7 +495,7 @@ def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
 
 return new_x1, new_y1, new_x2, new_y2
 
-def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
+...
 '''
 Convert annotations from pikepdf coordinates to image coordinates.
 '''

@@ -504,7 +512,10 @@ def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
 scale_height = image_page_height / rect_height
 
 # Extract the /Rect field
-...
+...
 
 # Convert the extracted /Rect field to a list of floats
 rect_coordinates = [float(coord) for coord in rect_field]
@@ -518,9 +529,30 @@ def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
 
 return x1_image, new_y1_image, x2_image, new_y2_image
 
-def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerResult, image:Image):
+...
 '''
-Converts an image with redaction coordinates from a CustomImageRecognizerResult to pymupdf coordinates.
+...
 '''
 
 rect_height = pymupdf_page.rect.height

@@ -533,14 +565,29 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
 scale_height = rect_height / image_page_height
 
 # Calculate scaled coordinates
-...
 
-...
 
+...
 
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
 '''
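Several of the convert_*_coords helpers above are only partially visible in this render. What they implement is rescaling a bounding box between image pixel space and PDF page point space, using per-axis scale factors like the scale_height shown. A generic sketch of the technique, under the assumption (true for PyMuPDF page rects) that both spaces use a top-left origin; this is not the repo's exact code:

def image_box_to_page_box(box, img_size, page_size):
    """box = (x1, y1, x2, y2) in image pixels; returns the box in page points."""
    img_w, img_h = img_size
    page_w, page_h = page_size
    sx, sy = page_w / img_w, page_h / img_h     # per-axis scale factors
    x1, y1, x2, y2 = box
    return (x1 * sx, y1 * sy, x2 * sx, y2 * sy)

# e.g. a box on a 300 DPI A4 scan (2480 x 3508 px) mapped onto a 595 x 842 pt page:
print(image_box_to_page_box((100, 200, 400, 260), (2480, 3508), (595, 842)))

The pikepdf variants additionally have to flip the y axis, since raw PDF /Rect coordinates use a bottom-left origin.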
@@ -577,7 +624,7 @@ def move_page_info(file_path: str) -> str:
 
 return new_file_path
 
-def redact_page_with_pymupdf(page:Page, ...
+...
 
 mediabox_height = page.mediabox[3] - page.mediabox[1]
 mediabox_width = page.mediabox[2] - page.mediabox[0]

@@ -599,10 +646,10 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 image = Image.open(image_path)
 
 # Check if this is an object used in the Gradio Annotation component
-if isinstance (...
-
+...
 
-for annot in ...
+...
 # Check if an Image recogniser result, or a Gradio annotation object
 if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
 

@@ -611,12 +658,16 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 # Should already be in correct format if img_annotator_box is an input
 if isinstance(annot, dict):
 img_annotation_box = annot
-
 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
 
 x1 = pymupdf_x1
 x2 = pymupdf_x2
 
+...
 # Else should be CustomImageRecognizerResult
 else:
 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)

@@ -633,12 +684,19 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 img_annotation_box["label"] = annot.entity_type
 except:
 img_annotation_box["label"] = "Redaction"
+...
 
 rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
 
 # Else it should be a pikepdf annotation object
-else:
-...
+...
 
 x1 = pymupdf_x1
 x2 = pymupdf_x2

@@ -650,6 +708,8 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 if image:
 img_width, img_height = image.size
 
+...
 x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
 
 img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1

@@ -662,6 +722,10 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 img_annotation_box["label"] = str(annot["/T"])
 else:
 img_annotation_box["label"] = "REDACTION"
+...
 
 # Convert to a PyMuPDF Rect object
 #rect = Rect(rect_coordinates)
@@ -672,29 +736,6 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 
 # If whole page is to be redacted, do that here
 if redact_whole_page == True:
-# # Small border to page that remains white
-# border = 5
-# # Define the coordinates for the Rect
-# whole_page_x1, whole_page_y1 = 0 + border, 0 + border # Bottom-left corner
-# whole_page_x2, whole_page_y2 = rect_width - border, rect_height - border # Top-right corner
-
-# whole_page_image_x1, whole_page_image_y1, whole_page_image_x2, whole_page_image_y2 = convert_pymupdf_to_image_coords(page, whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2, image)
-
-# # Create new image annotation element based on whole page coordinates
-# whole_page_rect = Rect(whole_page_x1, whole_page_y1, whole_page_x2, whole_page_y2)
-
-# # Write whole page annotation to annotation boxes
-# whole_page_img_annotation_box = {}
-# whole_page_img_annotation_box["xmin"] = whole_page_image_x1
-# whole_page_img_annotation_box["ymin"] = whole_page_image_y1
-# whole_page_img_annotation_box["xmax"] = whole_page_image_x2
-# whole_page_img_annotation_box["ymax"] = whole_page_image_y2
-# whole_page_img_annotation_box["color"] = (0,0,0)
-# whole_page_img_annotation_box["label"] = "Whole page"
-
-# redact_single_box(page, whole_page_rect, whole_page_img_annotation_box, custom_colours)
-
-# all_image_annotation_boxes.append(whole_page_img_annotation_box)
 
 whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
 all_image_annotation_boxes.append(whole_page_img_annotation_box)

@@ -712,14 +753,7 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page:dict, image:Image.Im
 
 return page, out_annotation_boxes
 
-def bounding_boxes_overlap(box1, box2):
-"""Check if two bounding boxes overlap."""
-return (box1[0] < box2[2] and box2[0] < box1[2] and
-box1[1] < box2[3] and box2[1] < box1[3])
 
-from collections import defaultdict
-from typing import List, Dict
-import copy
 
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
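The removed bounding_boxes_overlap helper (presumably relocated or no longer needed here) is the standard axis-aligned interval-overlap test: two boxes (x1, y1, x2, y2) intersect exactly when they overlap on both axes. A quick, self-contained check of its logic:

def bounding_boxes_overlap(box1, box2):
    """Boxes are (x1, y1, x2, y2); strict inequalities exclude edge-touching."""
    return (box1[0] < box2[2] and box2[0] < box1[2] and
            box1[1] < box2[3] and box2[1] < box1[3])

assert bounding_boxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))       # corners overlap
assert not bounding_boxes_overlap((0, 0, 10, 10), (10, 0, 20, 10))  # only share an edge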
@@ -822,117 +856,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
 return unique_bboxes
 
-
-# def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
-# ... (~110 further commented-out lines of this older merge_img_bboxes draft, all removed: signature/handwriting box merging, substring bbox reconstruction against combined_results, vertical grouping, and horizontal merging of nearby boxes)
-
 def redact_image_pdf(file_path:str,
 prepared_pdf_file_paths:List[str],
 language:str,
@@ -1279,17 +1202,21 @@ def redact_image_pdf(file_path:str,
 
 # Convert decision process to table
 decision_process_table = pd.DataFrame([{
-'...
-'...
 'start': result.start,
 'end': result.end,
 'score': result.score,
-'...
-'...
-...
+...
 
 all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])

@@ -1323,7 +1250,7 @@ def redact_image_pdf(file_path:str,
 pymupdf_doc = images
 
 # Check if the image already exists in annotations_all_pages
-print("annotations_all_pages:", annotations_all_pages)
+...
 existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 if existing_index is not None:
 # Replace the existing annotation

@@ -1346,7 +1273,7 @@ def redact_image_pdf(file_path:str,
 pymupdf_doc = images
 
 # Check if the image already exists in annotations_all_pages
-print("annotations_all_pages:", annotations_all_pages)
+...
 existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
 if existing_index is not None:
 # Replace the existing annotation
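The replace-or-append idiom used in both hunks above keeps exactly one annotation record per page image, so re-running a redaction pass overwrites rather than duplicates. In isolation (a sketch, not the repo's code):

def upsert_annotation(annotations_all_pages, image_annotations):
    """Keep one record per 'image' key: replace on re-runs, append otherwise."""
    existing_index = next(
        (index for index, ann in enumerate(annotations_all_pages)
         if ann["image"] == image_annotations["image"]),
        None,
    )
    if existing_index is not None:
        annotations_all_pages[existing_index] = image_annotations  # replace
    else:
        annotations_all_pages.append(image_annotations)            # append
    return annotations_all_pages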
@@ -1595,105 +1522,25 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
| 1595 |   
| 1596 |       return analysed_bounding_boxes
| 1597 |   
| 1598 | - 
| 1599 | - # def merge_text_bounding_boxes(analyser_results, characters:List[LTChar], combine_pixel_dist:int=20, vertical_padding:int=0):
| 1600 | - #     '''
| 1601 | - #     Merge identified bounding boxes containing PII that are very close to one another
| 1602 | - #     '''
| 1603 | - #     analysed_bounding_boxes = []
| 1604 | - #     if len(analyser_results) > 0 and len(characters) > 0:
| 1605 | - #         # Extract bounding box coordinates for sorting
| 1606 | - #         bounding_boxes = []
| 1607 | - #         text_out = []
| 1608 | - #         for result in analyser_results:
| 1609 | - #             char_boxes = [char.bbox for char in characters[result.start:result.end] if isinstance(char, LTChar)]
| 1610 | - #             char_text = [char._text for char in characters[result.start:result.end] if isinstance(char, LTChar)]
| 1611 | - #             if char_boxes:
| 1612 | - #                 # Calculate the bounding box that encompasses all characters
| 1613 | - #                 left = min(box[0] for box in char_boxes)
| 1614 | - #                 bottom = min(box[1] for box in char_boxes)
| 1615 | - #                 right = max(box[2] for box in char_boxes)
| 1616 | - #                 top = max(box[3] for box in char_boxes) + vertical_padding
| 1617 | - #                 bounding_boxes.append((bottom, left, result, [left, bottom, right, top], char_text)) # (y, x, result, bbox, text)
| 1618 | - 
| 1619 | - #                 char_text = "".join(char_text)
| 1620 | - 
| 1621 | - #         # Sort the results by y-coordinate and then by x-coordinate
| 1622 | - #         bounding_boxes.sort()
| 1623 | - 
| 1624 | - #         merged_bounding_boxes = []
| 1625 | - #         current_box = None
| 1626 | - #         current_y = None
| 1627 | - #         current_result = None
| 1628 | - #         current_text = []
| 1629 | - 
| 1630 | - #         for y, x, result, char_box, text in bounding_boxes:
| 1631 | - #             #print(f"Considering result: {result}")
| 1632 | - #             #print(f"Character box: {char_box}")
| 1633 | - 
| 1634 | - #             if current_y is None or current_box is None:
| 1635 | - #                 current_box = char_box
| 1636 | - #                 current_y = char_box[1]
| 1637 | - #                 current_result = result
| 1638 | - #                 current_text = list(text)
| 1639 | - #                 #print(f"Starting new box: {current_box}")
| 1640 | - #             else:
| 1641 | - #                 vertical_diff_bboxes = abs(char_box[1] - current_y)
| 1642 | - #                 horizontal_diff_bboxes = abs(char_box[0] - current_box[2])
| 1643 | - 
| 1644 | - #                 if (
| 1645 | - #                     vertical_diff_bboxes <= 5 and horizontal_diff_bboxes <= combine_pixel_dist
| 1646 | - #                 ):
| 1647 | - #                     #print("box is being extended")
| 1648 | - #                     current_box[2] = char_box[2]  # Extend the current box horizontally
| 1649 | - #                     current_box[3] = max(current_box[3], char_box[3])  # Ensure the top is the highest
| 1650 | - #                     current_result.end = max(current_result.end, result.end)  # Extend the text range
| 1651 | - #                     try:
| 1652 | - #                         current_result.entity_type = current_result.entity_type + " - " + result.entity_type
| 1653 | - #                     except Exception as e:
| 1654 | - #                         print("Unable to combine result entity types:")
| 1655 | - #                         print(e)
| 1656 | - #                     # Add a space if current_text is not empty
| 1657 | - #                     if current_text:
| 1658 | - #                         current_text.append(" ")  # Add space between texts
| 1659 | - #                     current_text.extend(text)
| 1660 | - 
| 1661 | - #                     #print(f"Latest merged box: {current_box[-1]}")
| 1662 | - #                 else:
| 1663 | - #                     merged_bounding_boxes.append(
| 1664 | - #                         {"text":"".join(current_text),"boundingBox": current_box, "result": current_result})
| 1665 | - 
| 1666 | - #                     # Reset current_box and current_y after appending
| 1667 | - #                     current_box = char_box
| 1668 | - #                     current_y = char_box[1]
| 1669 | - #                     current_result = result
| 1670 | - #                     current_text = list(text)
| 1671 | - 
| 1672 | - #         # After finishing with the current result, add the last box for this result
| 1673 | - #         if current_box:
| 1674 | - #             merged_bounding_boxes.append({"text":"".join(current_text), "boundingBox": current_box, "result": current_result})
| 1675 | - 
| 1676 | - #     if not merged_bounding_boxes:
| 1677 | - #         analysed_bounding_boxes.extend(
| 1678 | - #             {"text":text, "boundingBox": char.bbox, "result": result}
| 1679 | - #             for result in analyser_results
| 1680 | - #             for char in characters[result.start:result.end]
| 1681 | - #             if isinstance(char, LTChar)
| 1682 | - #         )
| 1683 | - #     else:
| 1684 | - #         analysed_bounding_boxes.extend(merged_bounding_boxes)
| 1685 | - 
| 1686 | - #     return analysed_bounding_boxes
| 1687 | - 
| 1688 | - 
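The block removed above was a commented-out duplicate of `merge_text_bounding_boxes`. For reference, the merge rule it encoded (combine two PII boxes when they sit on roughly the same line and are horizontally within `combine_pixel_dist`) reduces to this minimal sketch; it is a simplification of the deleted code, not the live implementation:

```python
def merge_line_boxes(boxes, combine_pixel_dist=20, line_tolerance=5):
    """Merge (x0, y0, x1, y1) boxes that share a baseline and nearly touch."""
    merged = []
    for box in sorted(boxes, key=lambda b: (b[1], b[0])):  # sort by y, then x
        if merged:
            last = merged[-1]
            same_line = abs(box[1] - last[1]) <= line_tolerance
            close = abs(box[0] - last[2]) <= combine_pixel_dist
            if same_line and close:
                # Extend the previous box instead of starting a new one
                merged[-1] = (last[0], min(last[1], box[1]),
                              max(last[2], box[2]), max(last[3], box[3]))
                continue
        merged.append(box)
    return merged

print(merge_line_boxes([(0, 100, 40, 112), (45, 100, 90, 112), (0, 60, 40, 72)]))
# -> [(0, 60, 40, 72), (0, 100, 90, 112)]
```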
| 1689 |   def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
| 1690 |       decision_process_table = pd.DataFrame()
| 1691 |   
| 1692 |       if len(analyser_results) > 0:
| 1693 |           # Create summary df of annotations to be made
| 1694 |           analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
| 1695 |           analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
| 1696 | -         analysed_bounding_boxes_df_text.columns = ["
| 1697 |           analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
| 1698 |           analysed_bounding_boxes_df_new['page'] = page_num + 1
| 1699 |           decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
@@ -1702,8 +1549,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
| 1702 |   
| 1703 |       return decision_process_table
| 1704 |   
| 1705 | - def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
| 1706 | -     annotations_on_page = []
| 1707 |       for analysed_bounding_box in analysed_bounding_boxes:
| 1708 |           bounding_box = analysed_bounding_box["boundingBox"]
| 1709 |           annotation = Dictionary(

@@ -1721,8 +1568,8 @@ def create_annotations_for_bounding_boxes(analysed_bounding_boxes):
| 1721 |               S=Name.S # Border style: solid
| 1722 |           )
| 1723 |       )
| 1724 | -         annotations_on_page.append(annotation)
| 1725 | -     return annotations_on_page
| 1726 |   
| 1727 |   def redact_text_pdf(
| 1728 |       filename: str, # Path to the PDF file to be redacted
@@ -1840,13 +1687,17 @@ def redact_text_pdf(
| 1840 |   
| 1841 |       if page_min <= page_no < page_max:
| 1842 |   
| 1843 |           for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
| 1844 |   
| 1845 |               page_analyser_results = []
| 1846 |               page_analysed_bounding_boxes = []
| 1847 |   
| 1848 |               characters = []
| 1849 | -             annotations_on_page = []
| 1850 |               decision_process_table_on_page = pd.DataFrame()
| 1851 |               page_text_outputs = pd.DataFrame()
| 1852 |   
@@ -1900,8 +1751,7 @@ def redact_text_pdf(
| 1900 |                           )
| 1901 |                           all_text_line_results.append((i, text_line_analyser_result))
| 1902 |   
| 1903 | - 
| 1904 | - 
| 1905 |                       elif pii_identification_method == "AWS Comprehend":
| 1906 |   
| 1907 |                           # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
@@ -2006,17 +1856,24 @@ def redact_text_pdf(
| 2006 |                   text_container_analyser_results.extend(text_line_analyser_result)
| 2007 |                   text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
| 2008 |   
| 2009 | -             print("text_container_analyser_results:", text_container_analyser_results)
| 2010 |   
| 2011 |               page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
| 2012 |   
| 2013 |   
| 2014 | -             print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
| 2015 |   
| 2016 |               # Annotate redactions on page
| 2017 | -             annotations_on_page = create_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
| 2018 |   
| 2019 | -             print("annotations_on_page:", annotations_on_page)
| 2020 |   
| 2021 |               # Make pymupdf page redactions
| 2022 |               #print("redact_whole_page_list:", redact_whole_page_list)
@@ -2025,7 +1882,9 @@ def redact_text_pdf(
| 2025 |                   else: redact_whole_page = False
| 2026 |               else: redact_whole_page = False
| 2027 |   
| 2028 | -             pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, annotations_on_page, image, redact_whole_page=redact_whole_page)
| 2029 |   
| 2030 |               #print("Did redact_page_with_pymupdf function")
| 2031 |               reported_page_no = page_no + 1
@@ -2037,6 +1896,7 @@ def redact_text_pdf(
| 2037 |   
| 2038 |               if not decision_process_table_on_page.empty:
| 2039 |                   all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
| 2040 |   
| 2041 |               if not page_text_outputs.empty:
| 2042 |                   page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
| 40 |   max_time_value = get_or_create_env_var('max_time_value', '999999')
| 41 |   print(f'The value of max_time_value is {max_time_value}')
| 42 |   
| 43 | + def bounding_boxes_overlap(box1, box2):
| 44 | +     """Check if two bounding boxes overlap."""
| 45 | +     return (box1[0] < box2[2] and box2[0] < box1[2] and
| 46 | +             box1[1] < box2[3] and box2[1] < box1[3])
| 47 | + 
| 48 |   def sum_numbers_before_seconds(string:str):
| 49 |       """Extracts numbers that precede the word 'seconds' from a string and adds them up.
| 50 |   
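The new `bounding_boxes_overlap` helper is a standard axis-aligned rectangle intersection test over `(x0, y0, x1, y1)` tuples. Because the comparisons are strict, boxes that merely share an edge do not count as overlapping. A quick standalone check (the sample boxes are illustrative):

```python
def bounding_boxes_overlap(box1, box2):
    """Check if two bounding boxes overlap."""
    return (box1[0] < box2[2] and box2[0] < box1[2] and
            box1[1] < box2[3] and box2[1] < box1[3])

assert bounding_boxes_overlap((0, 0, 10, 10), (5, 5, 15, 15))       # corners cross
assert not bounding_boxes_overlap((0, 0, 10, 10), (10, 0, 20, 10))  # edge contact only
```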
| 401 |       # Convert json to csv and also save this
| 402 |       #print("annotations_all_pages:", annotations_all_pages)
| 403 |   
| 404 | +     review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
| 405 |   
| 406 |       out_review_file_file_path = out_image_file_path + '_review_file.csv'
| 407 |       review_df.to_csv(out_review_file_file_path, index=None)
| 457 |   
| 458 |       return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
| 459 |   
| 460 | + def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
| 461 |       '''
| 462 |       Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
| 463 |       '''
| 479 |       x_diff_ratio = media_reference_x_diff / reference_box_width
| 480 |   
| 481 |       # Extract the annotation rectangle field
| 482 | +     if type=="pikepdf_annot":
| 483 | +         rect_field = pikepdf_bbox["/Rect"]
| 484 | +     else:
| 485 | +         rect_field = pikepdf_bbox
| 486 |       rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
| 487 |   
| 488 |       # Unpack coordinates
| 495 |   
| 496 |       return new_x1, new_y1, new_x2, new_y2
| 497 |   
| 498 | + def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image, type="pikepdf_annot"):
| 499 |       '''
| 500 |       Convert annotations from pikepdf coordinates to image coordinates.
| 501 |       '''
| 512 |       scale_height = image_page_height / rect_height
| 513 |   
| 514 |       # Extract the /Rect field
| 515 | +     if type=="pikepdf_annot":
| 516 | +         rect_field = annot["/Rect"]
| 517 | +     else:
| 518 | +         rect_field = annot
| 519 |   
| 520 |       # Convert the extracted /Rect field to a list of floats
| 521 |       rect_coordinates = [float(coord) for coord in rect_field]
|
| 529 |
|
| 530 |
return x1_image, new_y1_image, x2_image, new_y2_image
|
| 531 |
|
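Both conversion helpers above reduce to the same arithmetic: scale by the ratio of target size to source size, and flip the y-axis, since PDF coordinates grow upward from a bottom-left origin while image coordinates grow downward from a top-left origin. A self-contained sketch of that mapping (the page and image sizes are made-up values, and the helper name is illustrative):

```python
def pdf_rect_to_image_coords(rect, page_w, page_h, img_w, img_h):
    """Map a PDF-space rect (x0, y0, x1, y1, origin bottom-left)
    to image pixels (origin top-left)."""
    sx, sy = img_w / page_w, img_h / page_h
    x0, y0, x1, y1 = rect
    # A PDF y measured from the bottom becomes (page_h - y) from the top
    return (x0 * sx, (page_h - y1) * sy, x1 * sx, (page_h - y0) * sy)

# A 595x842pt A4 page rendered to a 1190x1684px image (2x scale)
print(pdf_rect_to_image_coords((100, 742, 200, 792), 595, 842, 1190, 1684))
# -> (200.0, 100.0, 400.0, 200.0)
```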
| 532 | + def convert_pikepdf_decision_output_to_image_coords(pymupdf_page, pikepdf_decision_ouput_data:List, image):
| 533 | +     if isinstance(image, str):
| 534 | +         image_path = image
| 535 | +         image = Image.open(image_path)
| 536 | + 
| 537 | +     # Loop through each item in the data
| 538 | +     for item in pikepdf_decision_ouput_data:
| 539 | +         # Extract the bounding box
| 540 | +         bounding_box = item['boundingBox']
| 541 | + 
| 542 | +         # Create a pikepdf_bbox dictionary to match the expected input
| 543 | +         pikepdf_bbox = {"/Rect": bounding_box}
| 544 | + 
| 545 | +         # Call the conversion function
| 546 | +         new_x1, new_y1, new_x2, new_y2 = convert_pikepdf_to_image_coords(pymupdf_page, pikepdf_bbox, image, type="pikepdf_annot")
| 547 | + 
| 548 | +         # Update the original object with the new bounding box values
| 549 | +         item['boundingBox'] = [new_x1, new_y1, new_x2, new_y2]
| 550 | + 
| 551 | +     return pikepdf_decision_ouput_data
| 552 | + 
| 553 | + def convert_image_coords_to_pymupdf(pymupdf_page, annot, image:Image, type="image_recognizer"):
| 554 |       '''
| 555 | +     Converts an image with redaction coordinates from a CustomImageRecognizerResult or pikepdf object with image coordinates to pymupdf coordinates.
| 556 |       '''
| 557 |   
| 558 |       rect_height = pymupdf_page.rect.height
| 565 |       scale_height = rect_height / image_page_height
| 566 |   
| 567 |       # Calculate scaled coordinates
| 568 | +     if type == "image_recognizer":
| 569 | +         x1 = (annot.left * scale_width)# + page_x_adjust
| 570 | +         new_y1 = (annot.top * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
| 571 | +         x2 = ((annot.left + annot.width) * scale_width)# + page_x_adjust # Calculate x1
| 572 | +         new_y2 = ((annot.top + annot.height) * scale_height)# - page_y_adjust # Calculate y1 correctly
| 573 | +     # Else assume it is a pikepdf derived object
| 574 | +     else:
| 575 | +         rect_field = annot["/Rect"]
| 576 | +         rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
| 577 |   
| 578 | +         # Unpack coordinates
| 579 | +         x1, y1, x2, y2 = rect_coordinates
| 580 | + 
| 581 | +         #print("scale_width:", scale_width)
| 582 | +         #print("scale_height:", scale_height)
| 583 |   
| 584 | +         x1 = (x1* scale_width)# + page_x_adjust
| 585 | +         new_y1 = ((y2 + (y1 - y2))* scale_height)# - page_y_adjust # Calculate y1 correctly
| 586 | +         x2 = ((x1 + (x2 - x1)) * scale_width)# + page_x_adjust # Calculate x1
| 587 | +         new_y2 = (y2 * scale_height)# - page_y_adjust # Flip Y0 (since it starts from bottom)
| 588 | + 
| 589 |   
| 590 | +     return x1, new_y1, x2, new_y2
| 591 |   
| 592 |   def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
| 593 |       '''
| 624 |   
| 625 |       return new_file_path
| 626 |   
| 627 | + def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_coords:bool=True):
| 628 |   
| 629 |       mediabox_height = page.mediabox[3] - page.mediabox[1]
| 630 |       mediabox_width = page.mediabox[2] - page.mediabox[0]
| 646 |           image = Image.open(image_path)
| 647 |   
| 648 |       # Check if this is an object used in the Gradio Annotation component
| 649 | +     if isinstance(page_annotations, dict):
| 650 | +         page_annotations = page_annotations["boxes"]
| 651 |   
| 652 | +     for annot in page_annotations:
| 653 |           # Check if an Image recogniser result, or a Gradio annotation object
| 654 |           if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
| 655 |   
| 658 |               # Should already be in correct format if img_annotator_box is an input
| 659 |               if isinstance(annot, dict):
| 660 |                   img_annotation_box = annot
| 661 |                   pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
| 662 |   
| 663 |                   x1 = pymupdf_x1
| 664 |                   x2 = pymupdf_x2
| 665 |   
| 666 | +                 # if hasattr(annot, 'text') and annot.text:
| 667 | +                 #     img_annotation_box["text"] = annot.text
| 668 | +                 # else:
| 669 | +                 #     img_annotation_box["text"] = ""
| 670 | + 
| 671 |               # Else should be CustomImageRecognizerResult
| 672 |               else:
| 673 |                   pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
| 684 |                       img_annotation_box["label"] = annot.entity_type
| 685 |                   except:
| 686 |                       img_annotation_box["label"] = "Redaction"
| 687 | +                 # if hasattr(annot, 'text') and annot.text:
| 688 | +                 #     img_annotation_box["text"] = annot.text
| 689 | +                 # else:
| 690 | +                 #     img_annotation_box["text"] = ""
| 691 |   
| 692 |               rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
| 693 |   
| 694 |           # Else it should be a pikepdf annotation object
| 695 | +         else:
| 696 | +             if convert_coords == True:
| 697 | +                 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
| 698 | +             else:
| 699 | +                 pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image, type="pikepdf_image_coords")
| 700 |   
| 701 |               x1 = pymupdf_x1
| 702 |               x2 = pymupdf_x2
| 708 |               if image:
| 709 |                   img_width, img_height = image.size
| 710 |   
| 711 | +             print("annot:", annot)
| 712 | + 
| 713 |               x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
| 714 |   
| 715 |               img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
| 722 |                   img_annotation_box["label"] = str(annot["/T"])
| 723 |               else:
| 724 |                   img_annotation_box["label"] = "REDACTION"
| 725 | +             # if hasattr(annot, 'text') and annot.text:
| 726 | +             #     img_annotation_box["text"] = annot.text
| 727 | +             # else:
| 728 | +             #     img_annotation_box["text"] = ""
| 729 |   
| 730 |           # Convert to a PyMuPDF Rect object
| 731 |           #rect = Rect(rect_coordinates)
| 736 |   
| 737 |       # If whole page is to be redacted, do that here
| 738 |       if redact_whole_page == True:
| 739 |   
| 740 |           whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5)
| 741 |           all_image_annotation_boxes.append(whole_page_img_annotation_box)
| 753 |   
| 754 |       return page, out_annotation_boxes
| 755 |   
| 756 |   
| 757 |   
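Whatever the annotation source (Gradio box, image recogniser result, or pikepdf object), `redact_page_with_pymupdf` presumably ends by adding redaction annotations to the page and applying them. Independent of this codebase, the core PyMuPDF redaction step looks like this minimal sketch (the input path is a placeholder):

```python
import fitz  # PyMuPDF

doc = fitz.open("example.pdf")  # placeholder path
page = doc[0]

# One black redaction box in PDF points (x0, y0, x1, y1)
page.add_redact_annot(fitz.Rect(100, 100, 300, 130), fill=(0, 0, 0))

# apply_redactions() removes the underlying content rather than just covering it
page.apply_redactions()
doc.save("example_redacted.pdf")
```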
| 758 |   def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
| 759 |   
| 856 |       unique_bboxes = list({(bbox.left, bbox.top, bbox.width, bbox.height): bbox for bbox in all_bboxes}.values())
| 857 |       return unique_bboxes
| 858 |   
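The final dedup line in `merge_img_bboxes` keeps one box per `(left, top, width, height)` tuple by building a dict keyed on coordinates, so later duplicates silently replace earlier ones. In isolation, with a stand-in box type:

```python
from dataclasses import dataclass

@dataclass
class Box:  # stand-in for the real bounding-box objects
    left: float
    top: float
    width: float
    height: float
    label: str = "Redaction"

boxes = [Box(0, 0, 10, 5), Box(0, 0, 10, 5, "duplicate"), Box(20, 0, 10, 5)]
unique = list({(b.left, b.top, b.width, b.height): b for b in boxes}.values())
print(len(unique))  # 2; the "duplicate" box wins for the shared coordinates
```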
| 859 |   def redact_image_pdf(file_path:str,
| 860 |       prepared_pdf_file_paths:List[str],
| 861 |       language:str,
| 1202 |   
| 1203 |       # Convert decision process to table
| 1204 |       decision_process_table = pd.DataFrame([{
| 1205 | +         'text': result.text,
| 1206 | +         'xmin': result.left,
| 1207 | +         'ymin': result.top,
| 1208 | +         'xmax': result.left + result.width,
| 1209 | +         'ymax': result.top + result.height,
| 1210 | +         'label': result.entity_type,
| 1211 |           'start': result.start,
| 1212 |           'end': result.end,
| 1213 |           'score': result.score,
| 1214 | +         'page': reported_page_number
| 1215 | + 
| 1216 | +     } for result in merged_redaction_bboxes]) #'left': result.left,
| 1217 | +     #'top': result.top,
| 1218 | +     #'width': result.width,
| 1219 | +     #'height': result.height,
| 1220 |   
| 1221 |       all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table])
| 1222 |   
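The reworked decision table flattens each merged redaction result straight into review-file columns: `xmin`/`ymin`/`xmax`/`ymax` derived from left/top/width/height, plus the match metadata and page number. A runnable sketch with a stand-in result type, since the real recognizer result class lives elsewhere in the codebase:

```python
import pandas as pd
from dataclasses import dataclass

@dataclass
class FakeResult:  # stand-in for the real recognizer result objects
    text: str
    left: float
    top: float
    width: float
    height: float
    entity_type: str
    start: int
    end: int
    score: float

merged_redaction_bboxes = [FakeResult("Jane Doe", 40, 100, 80, 12, "PERSON", 0, 8, 0.85)]
reported_page_number = 1  # illustrative

decision_process_table = pd.DataFrame([{
    'text': r.text,
    'xmin': r.left, 'ymin': r.top,
    'xmax': r.left + r.width, 'ymax': r.top + r.height,
    'label': r.entity_type,
    'start': r.start, 'end': r.end, 'score': r.score,
    'page': reported_page_number,
} for r in merged_redaction_bboxes])
print(decision_process_table)
```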
| 1250 |               pymupdf_doc = images
| 1251 |   
| 1252 |               # Check if the image already exists in annotations_all_pages
| 1253 | +             #print("annotations_all_pages:", annotations_all_pages)
| 1254 |               existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
| 1255 |               if existing_index is not None:
| 1256 |                   # Replace the existing annotation
| 1273 |               pymupdf_doc = images
| 1274 |   
| 1275 |               # Check if the image already exists in annotations_all_pages
| 1276 | +             #print("annotations_all_pages:", annotations_all_pages)
| 1277 |               existing_index = next((index for index, ann in enumerate(annotations_all_pages) if ann["image"] == image_annotations["image"]), None)
| 1278 |               if existing_index is not None:
| 1279 |                   # Replace the existing annotation
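Both branches above rely on the same find-or-replace idiom over the running annotations list; presumably the new annotation is appended when no entry for the page image exists yet. In isolation:

```python
annotations_all_pages = [{"image": "page_1.png", "boxes": []}]
image_annotations = {"image": "page_1.png", "boxes": [{"label": "PERSON"}]}

# Find the index of an existing entry for this page image, if any
existing_index = next((i for i, ann in enumerate(annotations_all_pages)
                       if ann["image"] == image_annotations["image"]), None)
if existing_index is not None:
    annotations_all_pages[existing_index] = image_annotations  # replace in place
else:
    annotations_all_pages.append(image_annotations)  # assumed first-visit path
```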
| 1522 |   
| 1523 |       return analysed_bounding_boxes
| 1524 |   
| 1525 |   def create_text_redaction_process_results(analyser_results, analysed_bounding_boxes, page_num):
| 1526 |       decision_process_table = pd.DataFrame()
| 1527 |   
| 1528 |       if len(analyser_results) > 0:
| 1529 |           # Create summary df of annotations to be made
| 1530 |           analysed_bounding_boxes_df_new = pd.DataFrame(analysed_bounding_boxes)
| 1531 | + 
| 1532 | +         # Remove brackets and split the string into four separate columns
| 1533 | +         #print("analysed_bounding_boxes_df_new:", analysed_bounding_boxes_df_new['boundingBox'])
| 1534 | +         # analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].str.strip('[]').str.split(',', expand=True)
| 1535 | + 
| 1536 | +         # Split the boundingBox list into four separate columns
| 1537 | +         analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
| 1538 | + 
| 1539 | +         # Convert the new columns to floats (if needed)
| 1540 | +         analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
| 1541 | + 
| 1542 |           analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
| 1543 | +         analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
| 1544 |           analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
| 1545 |           analysed_bounding_boxes_df_new['page'] = page_num + 1
| 1546 |           decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
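`apply(pd.Series)` is what turns the `boundingBox` column of 4-element lists into four named coordinate columns. The same step in isolation:

```python
import pandas as pd

df = pd.DataFrame({"boundingBox": [[10, 20, 110, 40], [15, 60, 90, 80]]})

# Expand each 4-element list into four columns, then name and cast them
df[["xmin", "ymin", "xmax", "ymax"]] = df["boundingBox"].apply(pd.Series)
df[["xmin", "ymin", "xmax", "ymax"]] = df[["xmin", "ymin", "xmax", "ymax"]].astype(float)
print(df.drop(columns="boundingBox"))
```

For what it's worth, `pd.DataFrame(df["boundingBox"].tolist())` would do the same expansion with less per-row overhead.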
| 1549 |   
| 1550 |       return decision_process_table
| 1551 |   
| 1552 | + def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
| 1553 | +     pikepdf_annotations_on_page = []
| 1554 |       for analysed_bounding_box in analysed_bounding_boxes:
| 1555 |           bounding_box = analysed_bounding_box["boundingBox"]
| 1556 |           annotation = Dictionary(
| 1568 |               S=Name.S # Border style: solid
| 1569 |           )
| 1570 |       )
| 1571 | +         pikepdf_annotations_on_page.append(annotation)
| 1572 | +     return pikepdf_annotations_on_page
| 1573 |   
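`Dictionary` and `Name` here come from pikepdf, and the renamed function now returns the per-page list instead of leaving it in shared state. A standalone sketch of building one such annotation dictionary; the keys mirror the visible fragment (`S=Name.S` for a solid border style) but the full set used in the real code may differ:

```python
from pikepdf import Dictionary, Name

def make_redaction_annotation(bounding_box):
    """Illustrative pikepdf annotation for one [x0, y0, x1, y1] box."""
    return Dictionary(
        Type=Name.Annot,
        Subtype=Name.Square,           # assumed subtype
        Rect=list(bounding_box),       # PDF points
        IC=[0, 0, 0],                  # assumed black interior colour
        BS=Dictionary(W=0, S=Name.S),  # border width 0, solid style
    )

print(make_redaction_annotation([100, 700, 200, 720]))
```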
| 1574 |   def redact_text_pdf(
| 1575 |       filename: str, # Path to the PDF file to be redacted
| 1687 |   
| 1688 |       if page_min <= page_no < page_max:
| 1689 |   
| 1690 | +         if isinstance(image, str):
| 1691 | +             image_path = image
| 1692 | +             image = Image.open(image_path)
| 1693 | + 
| 1694 |           for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
| 1695 |   
| 1696 |               page_analyser_results = []
| 1697 |               page_analysed_bounding_boxes = []
| 1698 |   
| 1699 |               characters = []
| 1700 | +             pikepdf_annotations_on_page = []
| 1701 |               decision_process_table_on_page = pd.DataFrame()
| 1702 |               page_text_outputs = pd.DataFrame()
| 1703 |   
| 1751 |                           )
| 1752 |                           all_text_line_results.append((i, text_line_analyser_result))
| 1753 |   
| 1754 | + 
| 1755 |                       elif pii_identification_method == "AWS Comprehend":
| 1756 |   
| 1757 |                           # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
| 1856 |                   text_container_analyser_results.extend(text_line_analyser_result)
| 1857 |                   text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
| 1858 |   
| 1859 | +             #print("text_container_analyser_results:", text_container_analyser_results)
| 1860 |   
| 1861 | +             page_analyser_results.extend(text_container_analyser_results) # Add this line
| 1862 |               page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
| 1863 |   
| 1864 |   
| 1865 | +             #print("page_analyser_results:", page_analyser_results)
| 1866 | +             #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
| 1867 | +             #print("image:", image)
| 1868 | + 
| 1869 | +             page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
| 1870 | + 
| 1871 | +             #print("page_analysed_bounding_boxes_out_converted:", page_analysed_bounding_boxes)
| 1872 |   
| 1873 |               # Annotate redactions on page
| 1874 | +             pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
| 1875 |   
| 1876 | +             #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
| 1877 |   
| 1878 |               # Make pymupdf page redactions
| 1879 |               #print("redact_whole_page_list:", redact_whole_page_list)
| 1882 |                   else: redact_whole_page = False
| 1883 |               else: redact_whole_page = False
| 1884 |   
| 1885 | +             pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
| 1886 | + 
| 1887 | +             #print("image_annotations:", image_annotations)
| 1888 |   
| 1889 |               #print("Did redact_page_with_pymupdf function")
| 1890 |               reported_page_no = page_no + 1
| 1896 |   
| 1897 |               if not decision_process_table_on_page.empty:
| 1898 |                   all_decision_process_table = pd.concat([all_decision_process_table, decision_process_table_on_page])
| 1899 | +             #print("all_decision_process_table:", all_decision_process_table)
| 1900 |   
| 1901 |               if not page_text_outputs.empty:
| 1902 |                   page_text_outputs = page_text_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
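One detail worth noting at the end of this hunk: sorting `page_text_outputs` by `["top", "left"]` descending orders lines top-of-page first in PDF-style coordinates, where larger `top` values sit higher on the page. In isolation:

```python
import pandas as pd

page_text_outputs = pd.DataFrame({
    "text": ["world", "hello", "footer"],
    "top":  [700, 700, 50],   # PDF-style y: larger means higher on the page
    "left": [120, 40, 40],
})

ordered = page_text_outputs.sort_values(
    ["top", "left"], ascending=[False, False]).reset_index(drop=True)
print(ordered["text"].tolist())  # ['world', 'hello', 'footer']
```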
tools/redaction_review.py
CHANGED
|
@@ -68,6 +68,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
| 68 |       #print("review_dataframe['label']", review_dataframe["label"])
| 69 |       recogniser_entities = review_dataframe["label"].unique().tolist()
| 70 |       recogniser_entities.append("ALL")
| 71 |   
| 72 |       #print("recogniser_entities:", recogniser_entities)
| 73 |   
@@ -187,7 +188,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
| 187 |   
| 188 |       return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
| 189 |   
| 190 | - def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], clear_all:bool=False):
| 191 |       '''
| 192 |       Overwrite current image annotations with modifications
| 193 |       '''

@@ -198,6 +199,8 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
| 198 |       #If no previous page or is 0, i.e. first time run, then rewrite current page
| 199 |       #if not previous_page:
| 200 |       #    previous_page = current_page
| 201 |   
| 202 |       image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
| 203 |   
@@ -206,9 +209,26 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
| 206 |       else:
| 207 |           all_image_annotations[previous_page - 1]["boxes"] = []
| 208 |   
| 209 | -     return all_image_annotations, current_page, current_page
| 210 |   
| 211 | - def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, progress=gr.Progress(track_tqdm=True)):
| 212 |       '''
| 213 |       Apply modified redactions to a pymupdf and export review files
| 214 |       '''
@@ -302,7 +322,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
| 302 |           output_files.append(out_pdf_file_path)
| 303 |   
| 304 |       try:
| 305 | - 
| 306 |   
| 307 |           out_annotation_file_path = output_folder + file_base + '_review_file.json'
| 308 |           with open(out_annotation_file_path, 'w') as f:

@@ -311,14 +331,16 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
| 311 |   
| 312 |           print("Saving annotations to CSV review file")
| 313 |   
| 314 |           # Convert json to csv and also save this
| 315 | -         review_df = convert_review_json_to_pandas_df(all_image_annotations)
| 316 |           out_review_file_file_path = output_folder + file_base + '_review_file.csv'
| 317 |           review_df.to_csv(out_review_file_file_path, index=None)
| 318 |           output_files.append(out_review_file_file_path)
| 319 |   
| 320 |       except Exception as e:
| 321 | -         print("Could not save annotations to json file:", e)
| 322 |   
| 323 |       return doc, all_image_annotations, output_files, output_log_files
| 324 |   
| 68 |       #print("review_dataframe['label']", review_dataframe["label"])
| 69 |       recogniser_entities = review_dataframe["label"].unique().tolist()
| 70 |       recogniser_entities.append("ALL")
| 71 | +     recogniser_entities = sorted(recogniser_entities)
| 72 |   
| 73 |       #print("recogniser_entities:", recogniser_entities)
| 74 |   
| 188 |   
| 189 |       return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
| 190 |   
| 191 | + def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), clear_all:bool=False):
| 192 |       '''
| 193 |       Overwrite current image annotations with modifications
| 194 |       '''
| 199 |       #If no previous page or is 0, i.e. first time run, then rewrite current page
| 200 |       #if not previous_page:
| 201 |       #    previous_page = current_page
| 202 | + 
| 203 | +     #print("image_annotated:", image_annotated)
| 204 |   
| 205 |       image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
| 206 |   
| 209 |       else:
| 210 |           all_image_annotations[previous_page - 1]["boxes"] = []
| 211 |   
| 212 | +     #print("all_image_annotations:", all_image_annotations)
| 213 | + 
| 214 | +     # Rewrite all_image_annotations search dataframe with latest updates
| 215 | +     try:
| 216 | +         review_dataframe = convert_review_json_to_pandas_df(all_image_annotations)[["page", "label"]]
| 217 | +         #print("review_dataframe['label']", review_dataframe["label"])
| 218 | +         recogniser_entities = review_dataframe["label"].unique().tolist()
| 219 | +         recogniser_entities.append("ALL")
| 220 | +         recogniser_entities = sorted(recogniser_entities)
| 221 |   
| 222 | +         recogniser_dataframe_out = gr.Dataframe(review_dataframe)
| 223 | +         #recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
| 224 | +         recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_drop, choices=recogniser_entities, allow_custom_value=True, interactive=True)
| 225 | +     except Exception as e:
| 226 | +         print("Could not extract recogniser information:", e)
| 227 | +         recogniser_dataframe_out = recogniser_dataframe
| 228 | + 
| 229 | +     return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
| 230 | + 
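The new try block rebuilds the entity filter from the latest annotations; in Gradio, returning freshly constructed `gr.Dropdown`/`gr.Dataframe` components from an event handler is how their choices and contents get updated. The refresh logic in isolation (labels are illustrative):

```python
import gradio as gr
import pandas as pd

def refresh_entity_filter(review_dataframe: pd.DataFrame, current_value: str):
    """Rebuild the recogniser-entity dropdown from the latest review data."""
    entities = review_dataframe["label"].unique().tolist()
    entities.append("ALL")
    entities = sorted(entities)
    return gr.Dropdown(value=current_value, choices=entities,
                       allow_custom_value=True, interactive=True)

df = pd.DataFrame({"page": [1, 1, 2], "label": ["PERSON", "EMAIL", "PERSON"]})
dropdown_update = refresh_entity_filter(df, "ALL")
```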
| 231 | + def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, progress=gr.Progress(track_tqdm=True)):
| 232 |       '''
| 233 |       Apply modified redactions to a pymupdf and export review files
| 234 |       '''
| 322 |           output_files.append(out_pdf_file_path)
| 323 |   
| 324 |       try:
| 325 | +         print("Saving annotations to JSON")
| 326 |   
| 327 |           out_annotation_file_path = output_folder + file_base + '_review_file.json'
| 328 |           with open(out_annotation_file_path, 'w') as f:

| 331 |   
| 332 |           print("Saving annotations to CSV review file")
| 333 |   
| 334 | +         print("review_file_state:", review_file_state)
| 335 | + 
| 336 |           # Convert json to csv and also save this
| 337 | +         review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
| 338 |           out_review_file_file_path = output_folder + file_base + '_review_file.csv'
| 339 |           review_df.to_csv(out_review_file_file_path, index=None)
| 340 |           output_files.append(out_review_file_file_path)
| 341 |   
| 342 |       except Exception as e:
| 343 | +         print("Could not save annotations to json or csv file:", e)
| 344 |   
| 345 |       return doc, all_image_annotations, output_files, output_log_files
| 346 |   
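`convert_review_json_to_pandas_df` is now called with the extra review-state argument. Its essential job, flattening the per-page annotation JSON into one row per box before the CSV export, looks roughly like this sketch (the box keys match those set in `redact_page_with_pymupdf`; the merge with the review state is omitted):

```python
import pandas as pd

def review_json_to_df(all_image_annotations):
    """Flatten [{'image': ..., 'boxes': [...]}, ...] into one row per box."""
    rows = []
    for page_no, page in enumerate(all_image_annotations, start=1):
        for box in page.get("boxes", []):
            rows.append({
                "image": page["image"], "page": page_no,
                "label": box.get("label", "Redaction"),
                "xmin": box.get("xmin"), "ymin": box.get("ymin"),
                "xmax": box.get("xmax"), "ymax": box.get("ymax"),
            })
    return pd.DataFrame(rows)

annotations = [{"image": "page_1.png",
                "boxes": [{"label": "PERSON", "xmin": 10, "ymin": 20,
                           "xmax": 60, "ymax": 35}]}]
review_json_to_df(annotations).to_csv("review_file.csv", index=None)
```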