Commit ec98119
Parent: c71d0c1

Comprehend now uses custom spacy recognisers on top of defaults. Added zoom functionality to annotator. Fixed some pdf mediabox issues and redacted image output issues.
Files changed:
- app.py (+37 -14)
- tools/aws_functions.py (+1 -1)
- tools/custom_image_analyser_engine.py (+10 -0)
- tools/file_conversion.py (+6 -4)
- tools/file_redaction.py (+126 -46)
- tools/helper_functions.py (+12 -1)
- tools/load_spacy_model_custom_recognisers.py (+1 -0)
- tools/redaction_review.py (+16 -3)
app.py CHANGED

@@ -13,9 +13,10 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
-from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator
+from tools.redaction_review import apply_redactions, crop, get_boxes_json, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
+from tools.load_spacy_model_custom_recognisers import custom_entities
 
 today_rev = datetime.now().strftime("%Y%m%d")

@@ -29,6 +30,10 @@ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT
 full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
 
+# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
+chosen_comprehend_entities.extend(custom_entities)
+full_comprehend_entity_list.extend(custom_entities)
+
 chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE"]
 
 full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']

@@ -117,6 +122,12 @@ with app:
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
 
+    ## Annotator zoom value
+    annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
+    zoom_true_bool = gr.State(True)
+    zoom_false_bool = gr.State(False)
+
+
     ###
     # UI DESIGN
     ###

@@ -164,6 +175,9 @@ with app:
     annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
     annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
     annotation_next_page_button = gr.Button("Next page", scale = 3)
+    with gr.Row():
+        annotate_zoom_in = gr.Button("Zoom in")
+        annotate_zoom_out = gr.Button("Zoom out")
 
     annotation_button_apply = gr.Button("Apply revised redactions", variant="primary")

@@ -238,9 +252,9 @@ with app:
     in_allow_list_text = gr.Textbox(label="Custom allow list load status")
 
     with gr.Accordion("Add or remove entity types to redact", open = False):
-        in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
-
         in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="Entities to redact - AWS Comprehend PII identification model (click close to down arrow for full list)")
+
+        in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Entities to redact - local PII identification model (click close to down arrow for full list)")
 
     handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract settings", choices=["Redact all identified handwriting", "Redact all identified signatures"], value=["Redact all identified handwriting", "Redact all identified signatures"])
     #with gr.Row():

@@ -260,18 +274,19 @@ with app:
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_textbox, doc_file_name_with_extension_textbox])
 
-    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox]).\
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
         then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state], api_name="prepare_doc").\
         then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc")
-
+        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
+        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
 
     # If the app has completed a batch of pages, it will run this until the end of all pages in the document
     current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
-        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number])
+        outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number]).\
+        then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom])
 
     # If a file has been completed, the function will continue onto the next document
-    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
+    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom]).\
         then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     # latest_file_completed_text.change(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state]).\
     #     then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return],

@@ -284,12 +299,20 @@ with app:
     # Page controls at top
     annotate_current_page.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+
+    # Zoom in and out on annotator
+    annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_zoom, inputs=[annotator_zoom_number, zoom_true_bool], outputs=[annotator_zoom_number]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+    annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        then(update_zoom, inputs=[annotator_zoom_number, zoom_false_bool], outputs=[annotator_zoom_number]).\
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     #annotation_button_get.click(get_boxes_json, annotator, json_boxes)
     annotation_button_apply.click(apply_redactions, inputs=[annotator, in_doc_files, pdf_doc_state, all_image_annotations_state, annotate_current_page], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files], scroll_to_output=True)

@@ -297,12 +320,12 @@ with app:
     # Page controls at bottom
     annotate_current_page_bottom.submit(
         modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
+        then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom])
 
     ###
     # TABULAR DATA REDACTION
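
The zoom buttons reuse the same three-step event chain as the page controls: persist the boxes drawn on the current page, step the hidden zoom state, then re-render the annotator from that state. A minimal, self-contained sketch of the pattern, with hypothetical stand-in functions rather than the app's real modify_existing_page_redactions/update_zoom/update_annotator:

import gradio as gr

def step_zoom(zoom, zoom_in):  # stand-in for update_zoom
    return zoom + 10 if zoom_in else zoom - 10

def render(zoom):  # stand-in for update_annotator
    return f"rendered at {zoom}%"

with gr.Blocks() as demo:
    zoom_state = gr.Number(value=100, visible=False)
    zoom_in_flag = gr.State(True)  # a baked-in boolean, like zoom_true_bool above
    out = gr.Textbox()
    zoom_btn = gr.Button("Zoom in")
    # click -> update the zoom state -> re-render from the new state
    zoom_btn.click(step_zoom, inputs=[zoom_state, zoom_in_flag], outputs=[zoom_state]).\
        then(render, inputs=[zoom_state], outputs=[out])

demo.launch()

Holding the direction flags in gr.State objects lets the two buttons share one update function instead of needing separate zoom-in and zoom-out callbacks.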
tools/aws_functions.py CHANGED

@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials
 bucket_name=""
 
-RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
+RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")
 print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
 
 AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
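
The one-line change here flips the default for RUN_AWS_FUNCTIONS, so AWS-backed features are on unless explicitly disabled. get_or_create_env_var comes from tools/helper_functions.py; a plausible sketch of its behaviour, assuming it seeds the environment with the default when the variable is unset:

import os

def get_or_create_env_var(var_name, default_value):
    # Read the variable if it is already set in the environment
    value = os.environ.get(var_name)
    if value is None:
        # Otherwise store and return the supplied default (assumed behaviour)
        os.environ[var_name] = default_value
        value = default_value
    return value

RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "1")  # now opted in by default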
tools/custom_image_analyser_engine.py CHANGED

@@ -13,6 +13,7 @@ from copy import deepcopy
 from tools.helper_functions import clean_unicode_text
 from tools.aws_functions import comprehend_client
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
+from tools.load_spacy_model_custom_recognisers import custom_entities
 #import string # Import string to get a list of common punctuation characters
 
 @dataclass

@@ -491,6 +492,14 @@ class CustomImageAnalyzerEngine:
                 analyzer_results_by_line[i] = analyzer_result
 
             elif pii_identification_method == "AWS Comprehend":
+
+                # If using AWS Comprehend, Spacy model is only used to identify the custom entities created. Comprehend can't pick up Titles, Streetnames, and UKPostcodes specifically
+                text_analyzer_kwargs["entities"] = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
+
+                spacy_analyzer_result = self.analyzer_engine.analyze(
+                    text=line_level_ocr_result.text, **text_analyzer_kwargs)
+                analyzer_results_by_line[i].extend(spacy_analyzer_result)
+
                 if len(line_level_ocr_result.text) >= 3:
                     # Add line to current batch with a separator
                     if current_batch:

@@ -509,6 +518,7 @@ class CustomImageAnalyzerEngine:
                     Text=current_batch,
                     LanguageCode=text_analyzer_kwargs["language"]
                 )
+
             except Exception as e:
                 print(e)
                 time.sleep(3)
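
This hunk makes the Comprehend path a hybrid: the local Presidio/spaCy engine runs first, restricted to the custom entities (TITLES, UKPOSTCODE, STREETNAME) that Comprehend has no types for, and the line text still goes to Comprehend in batches for everything else. A rough sketch of that flow under assumptions: a default AnalyzerEngine stands in for the app's engine (which, unlike the default, has the custom recognisers registered), and the Comprehend call needs valid AWS credentials:

from presidio_analyzer import AnalyzerEngine
import boto3

custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
chosen = ["NAME", "EMAIL", "TITLES", "UKPOSTCODE"]  # example user selection

# In the app, the engine has custom spaCy recognisers registered for the
# entities above; a plain AnalyzerEngine() would simply return no matches for them.
analyzer = AnalyzerEngine()
comprehend = boto3.client("comprehend", region_name="eu-west-2")

line = "Mr Smith lives at SW1A 1AA"

# 1. Local pass, restricted to the custom recognisers Comprehend lacks
local_results = analyzer.analyze(
    text=line, language="en",
    entities=[e for e in chosen if e in custom_entities])

# 2. Remote pass for the standard Comprehend entity types (requires credentials)
remote = comprehend.detect_pii_entities(Text=line, LanguageCode="en")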
tools/file_conversion.py CHANGED

@@ -11,6 +11,8 @@ import pymupdf
 from gradio import Progress
 from typing import List, Optional
 
+image_dpi = 300.0
+
 def is_pdf_or_image(filename):
     """
     Check if a file name is a PDF or an image file.

@@ -42,7 +44,7 @@ def is_pdf(filename):
 # %%
 ## Convert pdf to image if necessary
 
-def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(track_tqdm=True)):
+def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
 
     # Get the number of pages in the PDF
    page_count = pdfinfo_from_path(pdf_path)['Pages']

@@ -70,7 +72,7 @@ def convert_pdf_to_images(pdf_path:str, page_min:int = 0, progress=Progress(trac
 
        else:
-            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=300)
+            image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
 
            image = image_l[0]

@@ -334,7 +336,7 @@ def prepare_image_or_pdf(
 
     return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc
 
-def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
+def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
     file_path_without_ext = get_file_path_end(in_file_path)
 
     out_file_paths = out_text_file_path

@@ -344,7 +346,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
 
     pdf_text_image_paths = process_file(out_text_file_path[0])
     out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
-    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=300.0, save_all=True, append_images=pdf_text_image_paths[1:])
+    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
 
     # out_file_paths.append(out_text_image_file_path)
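
The hard-coded DPI values are lifted into a module-level image_dpi constant that is threaded through both converters, and pdf2image now renders with use_cropbox=True, presumably so rendered images line up with the visible cropbox rather than the full mediabox, matching the coordinate fixes in tools/file_redaction.py below. A small usage sketch of the new rendering path, with a hypothetical sample.pdf:

from pdf2image import convert_from_path

image_dpi = 300.0
# Render only the first page at image_dpi, cropped to the cropbox,
# as convert_pdf_to_images now does page by page
pages = convert_from_path("sample.pdf", first_page=1, last_page=1,
                          dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
pages[0].save("sample_page1.png")
print(pages[0].size)  # pixel size scales with dpi: points / 72 * dpi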
tools/file_redaction.py CHANGED

@@ -27,8 +27,8 @@ from collections import defaultdict # For efficient grouping
 from presidio_analyzer import RecognizerResult
 
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult
-from tools.file_conversion import process_file
-from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
+from tools.file_conversion import process_file, image_dpi
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities
 from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 # from tools.data_anonymise import generate_decision_process_output

@@ -314,8 +314,8 @@ def choose_and_run_redactor(file_paths:List[str],
 
     # Save file
     if is_pdf(file_path) == False:
-        out_image_file_path = output_folder + file_path_without_ext + "
-        pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=
+        out_image_file_path = output_folder + file_path_without_ext + "_redacted_as_pdf.pdf"
+        pymupdf_doc[0].save(out_image_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
 
     else:
         out_image_file_path = output_folder + file_path_without_ext + "_redacted.pdf"

@@ -413,35 +413,40 @@ def choose_and_run_redactor(file_paths:List[str],
 
     return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number
 
-def
-    '''
-    Convert annotations from pikepdf to pymupdf format
-    '''
-    page_y_adjust = (rect_height - mediabox_height) / 2 # Center adjustment
-    rect_coordinates = [float(coord) for coord in rect_field]
-    #
-    x1, y1, x2, y2 = rect_coordinates
+def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox):
+    '''
+    Convert annotations from pikepdf to pymupdf format, handling the mediabox larger than rect.
+    '''
+    # Use cropbox if available, otherwise use mediabox
+    reference_box = pymupdf_page.rect
+    mediabox = pymupdf_page.mediabox
+
+    reference_box_height = reference_box.height
+    reference_box_width = reference_box.width
+
+    # Convert PyMuPDF coordinates back to PDF coordinates (bottom-left origin)
+    media_height = mediabox.height
+    media_width = mediabox.width
+
+    media_reference_y_diff = media_height - reference_box_height
+    media_reference_x_diff = media_width - reference_box_width
+
+    y_diff_ratio = media_reference_y_diff / reference_box_height
+    x_diff_ratio = media_reference_x_diff / reference_box_width
+
+    # Extract the annotation rectangle field
+    rect_field = pikepdf_bbox["/Rect"]
+    rect_coordinates = [float(coord) for coord in rect_field] # Convert to floats
+
+    # Unpack coordinates
+    x1, y1, x2, y2 = rect_coordinates
+
+    new_x1 = x1 - (media_reference_x_diff * x_diff_ratio)
+    new_y1 = media_height - y2 - (media_reference_y_diff * y_diff_ratio)
+    new_x2 = x2 - (media_reference_x_diff * x_diff_ratio)
+    new_y2 = media_height - y1 - (media_reference_y_diff * y_diff_ratio)
+
+    return new_x1, new_y1, new_x2, new_y2
 
 def convert_pikepdf_to_image_coords(pymupdf_page, annot, image:Image):
     '''

@@ -496,6 +501,64 @@ def convert_image_coords_to_pymupdf(pymupdf_page, annot:CustomImageRecognizerRes
 
     return x1, new_y1, x2, new_y2
 
+# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
+#     '''
+#     Converts coordinates from pymupdf format to image coordinates.
+#     '''
+
+#     rect_height = pymupdf_page.rect.height
+#     rect_width = pymupdf_page.rect.width
+
+#     image_page_width, image_page_height = image.size
+
+#     # Calculate scaling factors between pymupdf and PIL image
+#     scale_width = image_page_width / rect_width
+#     scale_height = image_page_height / rect_height
+
+#     x1_image = x1 * scale_width
+#     y1_image = ((rect_height - y2) * scale_height)
+#     x2_image = x2 * scale_width
+#     y2_image = ((rect_height - y1) * scale_height)
+
+#     return x1_image, y1_image, x2_image, y2_image
+
+def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
+    '''
+    Converts coordinates from pymupdf format to image coordinates,
+    accounting for mediabox dimensions.
+    '''
+
+    rect_height = pymupdf_page.rect.height
+    rect_width = pymupdf_page.rect.width
+
+    # Get mediabox dimensions
+    mediabox = pymupdf_page.mediabox
+    mediabox_width = mediabox.width
+    mediabox_height = mediabox.height
+
+    image_page_width, image_page_height = image.size
+
+    # Calculate scaling factors using mediabox dimensions
+    scale_width = image_page_width / mediabox_width
+    scale_height = image_page_height / mediabox_height
+
+    print("scale_width:", scale_width)
+    print("scale_height:", scale_height)
+
+    rect_to_mediabox_x_scale = mediabox_width / rect_width
+    rect_to_mediabox_y_scale = mediabox_height / rect_height
+
+    print("rect_to_mediabox_x_scale:", rect_to_mediabox_x_scale)
+    print("rect_to_mediabox_y_scale:", rect_to_mediabox_y_scale)
+
+    # Adjust coordinates based on scaling factors
+    x1_image = (x1 * scale_width) * rect_to_mediabox_x_scale
+    y1_image = (y1 * scale_height) * rect_to_mediabox_y_scale
+    x2_image = (x2 * scale_width) * rect_to_mediabox_x_scale
+    y2_image = (y2 * scale_height) * rect_to_mediabox_y_scale
+
+    return x1_image, y1_image, x2_image, y2_image
+
 def convert_gradio_annotation_coords_to_pymupdf(pymupdf_page:Page, annot:dict, image:Image):
     '''
     Converts an image with redaction coordinates from a gradio annotation component to pymupdf coordinates.

@@ -587,25 +650,25 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
 
         # Else it should be a pikepdf annotation object
         else:
-            x1, pymupdf_y1, x2, pymupdf_y2 =
+            x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
 
             rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
 
             img_annotation_box = {}
 
             if image:
-                img_annotation_box["
-                img_annotation_box["
-                img_annotation_box["
+                img_width, img_height = image.size
+
+                x1, image_y1, x2, image_y2 = convert_pymupdf_to_image_coords(page, x1, pymupdf_y1, x2, pymupdf_y2, image)
+
+                img_annotation_box["xmin"] = x1 #* (img_width / rect_width) # Use adjusted x1
+                img_annotation_box["ymin"] = image_y1 #* (img_width / rect_width) # Use adjusted y1
+                img_annotation_box["xmax"] = x2# * (img_height / rect_height) # Use adjusted x2
+                img_annotation_box["ymax"] = image_y2 #* (img_height / rect_height) # Use adjusted y2
+                img_annotation_box["color"] = (0, 0, 0)
 
             if isinstance(annot, Dictionary):
-                #print("Trying to get label out of annotation", annot["/T"])
                 img_annotation_box["label"] = str(annot["/T"])
-                #print("Label is:", img_annotation_box["label"])
             else:
                 img_annotation_box["label"] = "REDACTION"

@@ -646,6 +709,18 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
     merged_bboxes = []
     grouped_bboxes = defaultdict(list)
 
+
+    # Process signature and handwriting results
+    if signature_recogniser_results or handwriting_recogniser_results:
+        if "Redact all identified handwriting" in handwrite_signature_checkbox:
+            #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
+            merged_bboxes.extend(handwriting_recogniser_results)
+
+        if "Redact all identified signatures" in handwrite_signature_checkbox:
+            #print("Signature boxes exist at merge:", signature_recogniser_results)
+            merged_bboxes.extend(signature_recogniser_results)
+
+
     # Reconstruct bounding boxes for substrings of interest
     reconstructed_bboxes = []
     for bbox in bboxes:

@@ -735,16 +810,6 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
 
         merged_bboxes.append(merged_box)
 
-    # Process signature and handwriting results
-    if signature_recogniser_results or handwriting_recogniser_results:
-        if "Redact all identified handwriting" in handwrite_signature_checkbox:
-            #print("Handwriting boxes exist at merge:", handwriting_recogniser_results)
-            merged_bboxes.extend(handwriting_recogniser_results)
-
-        if "Redact all identified signatures" in handwrite_signature_checkbox:
-            #print("Signature boxes exist at merge:", signature_recogniser_results)
-            merged_bboxes.extend(signature_recogniser_results)
-
     #print("bboxes:", bboxes)
 
     return merged_bboxes

@@ -1483,6 +1548,21 @@ def redact_text_pdf(
                 all_text_line_results.append((i, text_line_analyser_result))
 
             elif pii_identification_method == "AWS Comprehend":
+
+                # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
+                custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
+
+                text_line_analyser_result = nlp_analyser.analyze(
+                    text=text_line.text,
+                    language=language,
+                    entities=custom_redact_entities,
+                    score_threshold=score_threshold,
+                    return_decision_process=True,
+                    allow_list=allow_list
+                )
+                all_text_line_results.append((i, text_line_analyser_result))
+
+
                 if len(text_line.text) >= 3:
                     # Add separator between lines
                     if current_batch:
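
The new convert_pikepdf_coords_to_pymupdf is the core of the mediabox fix: when a page's mediabox is taller or wider than its visible rect (cropbox), the pikepdf /Rect coordinates are flipped against the mediabox height and then pulled back by a margin-scaled offset. A worked example of the arithmetic, with assumed page geometry (a 612x1008 pt mediabox cropped to a visible 612x792 pt rect):

# Assumed geometry standing in for pymupdf_page.mediabox and pymupdf_page.rect
media_width, media_height = 612.0, 1008.0
ref_width, ref_height = 612.0, 792.0

# Assumed pikepdf annotation "/Rect", bottom-left origin
x1, y1, x2, y2 = 100.0, 500.0, 200.0, 550.0

y_diff = media_height - ref_height  # 216.0 of hidden margin
x_diff = media_width - ref_width    # 0.0
y_ratio = y_diff / ref_height
x_ratio = x_diff / ref_width

# Flip to top-left origin against the mediabox, then pull back by the
# margin-scaled offset, exactly as convert_pikepdf_coords_to_pymupdf does
new_y1 = media_height - y2 - (y_diff * y_ratio)  # ~399.1
new_y2 = media_height - y1 - (y_diff * y_ratio)  # ~449.1
new_x1 = x1 - (x_diff * x_ratio)                 # 100.0, unchanged
new_x2 = x2 - (x_diff * x_ratio)                 # 200.0
print(new_x1, new_y1, new_x2, new_y2)

With no horizontal margin the x coordinates pass through untouched; only the y pair picks up the flip plus the 216 * (216/792) correction for the cropped-off strip.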
tools/helper_functions.py CHANGED

@@ -3,9 +3,20 @@ import re
 import gradio as gr
 import pandas as pd
 import unicodedata
+from gradio_image_annotation import image_annotator
 
 def reset_state_vars():
-    return [], [], pd.DataFrame(), pd.DataFrame(), 0, ""
+    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
+        label="Modify redaction boxes",
+        label_list=["Redaction"],
+        label_colors=[(0, 0, 0)],
+        show_label=False,
+        sources=None,#["upload"],
+        show_clear_button=False,
+        show_share_button=False,
+        show_remove_button=False,
+        interactive=False
+    )
 
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
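
reset_state_vars now also returns a freshly-configured, non-interactive image_annotator, so the annotation component itself is cleared when a new redaction run starts (it is the extra annotator output added to document_redact_btn.click in app.py). This leans on the Gradio pattern of returning a component instance from a callback to reconfigure an output; a minimal sketch of that pattern with a generic Textbox rather than the project's annotator:

import gradio as gr

def reset():
    # Returning a new component instance replaces the output's value and config
    return gr.Textbox(value="", interactive=False)

with gr.Blocks() as demo:
    box = gr.Textbox(value="stale contents")
    gr.Button("Reset").click(reset, outputs=[box])

demo.launch()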
tools/load_spacy_model_custom_recognisers.py CHANGED

@@ -10,6 +10,7 @@ import re
 # %%
 model_name = "en_core_web_lg" #"en_core_web_trf"
 score_threshold = 0.001
+custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME"]
 
 # %% [markdown]
 # #### Custom recognisers
tools/redaction_review.py CHANGED

@@ -37,9 +37,22 @@ def increase_page(number:int, image_annotator_object:AnnotatedImageData):
     else:
         return max_pages, max_pages
 
-def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
+def update_zoom(current_zoom_level:int, decrease:bool=True):
+    if decrease == False:
+        if current_zoom_level >= 50:
+            current_zoom_level -= 10
+    else:
+        if current_zoom_level < 100:
+            current_zoom_level += 10
+
+    return current_zoom_level
+
+
+def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, zoom:int=100):
     # print("\nImage annotator object:", image_annotator_object)
 
+    zoom_str = str(zoom) + '%'
+
     if not image_annotator_object:
         return image_annotator(
             label="Modify redaction boxes",

@@ -76,8 +89,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int):
         #label_list=["Redaction"],
         #label_colors=[(0, 0, 0)],
         show_label=False,
-        height=
-        width=
+        height=zoom_str,
+        width=zoom_str,
         box_min_size=1,
         box_selected_thickness=2,
         handle_size=4,
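
Zoom is applied by re-rendering the annotator with percentage height and width strings, so the component scales rather than re-rasterising the page. One wiring note: the boolean argument reads inverted, since zoom_true_bool (the zoom-in button in app.py) lands in the decrease=True branch, which increases the percentage. A quick check of the clamping behaviour of update_zoom as committed:

from tools.redaction_review import update_zoom

for level, flag in [(90, True), (100, True), (100, False), (50, False), (40, False)]:
    print(level, flag, "->", update_zoom(level, flag))
# 90 True -> 100
# 100 True -> 100 (cap: only levels below 100 are increased)
# 100 False -> 90
# 50 False -> 40
# 40 False -> 40 (floor: only levels of 50 or more are decreased)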