Commit bde6e5b
Parent(s): 6b28cfa
Implemented fuzzy matching for the deny list. Added an option to merge multiple review files. Review files from the redaction step now include the text of each redaction.
Files changed:
- app.py +25 -9
- requirements.txt +2 -0
- tools/custom_image_analyser_engine.py +42 -11
- tools/data_anonymise.py +3 -3
- tools/file_conversion.py +10 -6
- tools/file_redaction.py +63 -34
- tools/helper_functions.py +64 -23
- tools/load_spacy_model_custom_recognisers.py +176 -25
- tools/redaction_review.py +13 -5
app.py
CHANGED
|
@@ -10,7 +10,7 @@ from datetime import datetime
|
|
| 10 |
from gradio_image_annotation import image_annotator
|
| 11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
| 12 |
|
| 13 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars
|
| 14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
| 15 |
from tools.file_redaction import choose_and_run_redactor
|
| 16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
|
@@ -30,15 +30,16 @@ ensure_output_folder_exists()
|
|
| 30 |
|
| 31 |
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
| 32 |
|
| 33 |
-
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
|
| 34 |
|
| 35 |
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
|
| 36 |
chosen_comprehend_entities.extend(custom_entities)
|
| 37 |
full_comprehend_entity_list.extend(custom_entities)
|
| 38 |
|
|
|
|
| 39 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
|
| 40 |
|
| 41 |
-
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
|
| 42 |
|
| 43 |
language = 'en'
|
| 44 |
|
|
@@ -68,7 +69,6 @@ with app:
|
|
| 68 |
pdf_doc_state = gr.State([])
|
| 69 |
all_image_annotations_state = gr.State([])
|
| 70 |
|
| 71 |
-
|
| 72 |
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
| 73 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
| 74 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
|
@@ -261,7 +261,7 @@ with app:
|
|
| 261 |
|
| 262 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
| 263 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
| 264 |
-
adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple')
|
| 265 |
convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
|
| 266 |
|
| 267 |
###
|
|
@@ -325,9 +325,12 @@ with app:
|
|
| 325 |
|
| 326 |
with gr.Accordion("Select entity types to redact", open = True):
|
| 327 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
|
| 328 |
-
|
| 329 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
|
| 330 |
|
| 331 |
with gr.Accordion("Redact only selected pages", open = False):
|
| 332 |
with gr.Row():
|
| 333 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
|
@@ -341,7 +344,16 @@ with app:
|
|
| 341 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
|
| 342 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
| 343 |
|
| 344 |
-
log_files_output = gr.File(label="Log file output", interactive=False)
|
| 345 |
|
| 346 |
###
|
| 347 |
# PDF/IMAGE REDACTION
|
|
@@ -350,12 +362,12 @@ with app:
|
|
| 350 |
|
| 351 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
| 352 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
| 353 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
| 354 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
| 355 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 356 |
|
| 357 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
| 358 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
| 359 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
|
| 360 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 361 |
|
|
@@ -461,6 +473,10 @@ with app:
|
|
| 461 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
| 462 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
| 463 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
| 464 |
|
| 465 |
|
| 466 |
###
|
|
|
|
| 10 |
from gradio_image_annotation import image_annotator
|
| 11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
| 12 |
|
| 13 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
|
| 14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
| 15 |
from tools.file_redaction import choose_and_run_redactor
|
| 16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
|
|
|
| 30 |
|
| 31 |
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
| 32 |
|
| 33 |
+
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
|
| 34 |
|
| 35 |
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
|
| 36 |
chosen_comprehend_entities.extend(custom_entities)
|
| 37 |
full_comprehend_entity_list.extend(custom_entities)
|
| 38 |
|
| 39 |
+
# Entities for local PII redaction option
|
| 40 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
|
| 41 |
|
| 42 |
+
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
|
| 43 |
|
| 44 |
language = 'en'
|
| 45 |
|
|
|
|
| 69 |
pdf_doc_state = gr.State([])
|
| 70 |
all_image_annotations_state = gr.State([])
|
| 71 |
|
|
|
|
| 72 |
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
| 73 |
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
| 74 |
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
|
|
|
| 261 |
|
| 262 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
| 263 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
| 264 |
+
adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
|
| 265 |
convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
|
| 266 |
|
| 267 |
###
|
|
|
|
| 325 |
|
| 326 |
with gr.Accordion("Select entity types to redact", open = True):
|
| 327 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
|
|
|
|
| 328 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
|
| 329 |
|
| 330 |
+
with gr.Row():
|
| 331 |
+
max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
|
| 332 |
+
match_fuzzy_whole_phrase_bool = gr.Checkbox(label="Should fuzzy match on entire phrases in deny list (as opposed to each word individually)?", value=True)
|
| 333 |
+
|
| 334 |
with gr.Accordion("Redact only selected pages", open = False):
|
| 335 |
with gr.Row():
|
| 336 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
|
|
|
| 344 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
|
| 345 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
| 346 |
|
| 347 |
+
log_files_output = gr.File(label="Log file output", interactive=False)
|
| 348 |
+
|
| 349 |
+
with gr.Accordion("Combine multiple review files", open = False):
|
| 350 |
+
multiple_review_files_in_out = gr.File(label="Upload multiple review_file.csv files here to merge them into a single review file.", file_count='multiple', file_types=['.csv'])
|
| 351 |
+
merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
|
| 356 |
+
### UI INTERACTION ###
|
| 357 |
|
| 358 |
###
|
| 359 |
# PDF/IMAGE REDACTION
|
|
|
|
| 362 |
|
| 363 |
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
| 364 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
| 365 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
|
| 366 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
| 367 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 368 |
|
| 369 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
| 370 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
|
| 371 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
|
| 372 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
| 373 |
|
|
|
|
| 473 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
| 474 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
| 475 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
| 476 |
+
|
| 477 |
+
|
| 478 |
+
# Merge multiple review csv files together
|
| 479 |
+
merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
|
| 480 |
|
| 481 |
|
| 482 |
###
|
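The new merge button is wired to a merge_csv_files helper from tools.helper_functions (changed in this commit but not shown in this excerpt). A minimal sketch of what such a helper could look like, assuming the uploaded review files share the standard review_file.csv columns and that the output filename and sort order used here are illustrative:

import os
import pandas as pd

def merge_csv_files(file_list, output_folder="output/"):
    # Read each uploaded review CSV; Gradio may pass file objects or plain paths
    frames = [pd.read_csv(f.name if hasattr(f, "name") else f) for f in file_list]
    merged = pd.concat(frames, ignore_index=True)

    # Keep redactions in reading order and drop exact duplicates across files
    sort_cols = [c for c in ["page", "ymin", "xmin", "label"] if c in merged.columns]
    if sort_cols:
        merged = merged.sort_values(sort_cols)
    merged = merged.drop_duplicates()

    out_path = os.path.join(output_folder, "merged_review_file.csv")  # assumed output name
    merged.to_csv(out_path, index=False)

    # Returning a list of paths lets the same gr.File component display the result
    return [out_path]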
requirements.txt
CHANGED
|
@@ -16,6 +16,8 @@ boto3==1.35.83
|
|
| 16 |
pyarrow==18.1.0
|
| 17 |
openpyxl==3.1.2
|
| 18 |
Faker==22.2.0
|
| 19 |
gradio_image_annotation==0.2.5
|
| 20 |
numpy==1.26.4
|
| 21 |
awslambdaric==3.0.0
|
|
|
|
| 16 |
pyarrow==18.1.0
|
| 17 |
openpyxl==3.1.2
|
| 18 |
Faker==22.2.0
|
| 19 |
+
python-levenshtein==0.26.1
|
| 20 |
+
spaczz==0.6.1
|
| 21 |
gradio_image_annotation==0.2.5
|
| 22 |
numpy==1.26.4
|
| 23 |
awslambdaric==3.0.0
|
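The two new dependencies support the fuzzy deny-list matching: python-levenshtein provides fast edit-distance calculations and spaczz adds fuzzy matching on top of spaCy. As a rough illustration of what a "maximum spelling mistakes" budget means in edit-distance terms (the phrase and threshold below are made up):

import Levenshtein

deny_phrase = "John Smith"
ocr_text = "Jon Smith"          # OCR output with one missing character

max_spelling_mistakes = 1
distance = Levenshtein.distance(deny_phrase.lower(), ocr_text.lower())
print(distance, distance <= max_spelling_mistakes)  # 1 True: within the allowed budget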
tools/custom_image_analyser_engine.py
CHANGED
|
@@ -560,7 +560,7 @@ def run_page_text_redaction(
|
|
| 560 |
if not nlp_analyser:
|
| 561 |
raise ValueError("nlp_analyser is required for Local identification method")
|
| 562 |
|
| 563 |
-
print("page text:", page_text)
|
| 564 |
|
| 565 |
page_analyser_result = nlp_analyser.analyze(
|
| 566 |
text=page_text,
|
|
@@ -1077,15 +1077,15 @@ class CustomImageAnalyzerEngine:
|
|
| 1077 |
line_length = len(line_text)
|
| 1078 |
redaction_text = redaction_relevant_ocr_result.text
|
| 1079 |
|
| 1080 |
-
#
|
| 1081 |
|
| 1082 |
for redaction_result in text_analyzer_results:
|
| 1083 |
-
#
|
| 1084 |
-
#
|
| 1085 |
-
#
|
| 1086 |
-
#
|
| 1087 |
|
| 1088 |
-
# Check if the redaction text is
|
| 1089 |
|
| 1090 |
if redaction_text not in allow_list:
|
| 1091 |
|
|
@@ -1098,14 +1098,45 @@ class CustomImageAnalyzerEngine:
|
|
| 1098 |
matched_words = matched_text.split()
|
| 1099 |
|
| 1100 |
# print(f"Found match: '{matched_text}' in line")
|
| 1101 |
|
| 1102 |
# Find the corresponding words in the OCR results
|
| 1103 |
matching_word_boxes = []
|
| 1104 |
for word_info in ocr_results_with_children_child_info.get('words', []):
|
| 1105 |
-
|
| 1106 |
-
|
| 1107 |
matching_word_boxes.append(word_info['bounding_box'])
|
| 1108 |
-
#
|
| 1109 |
|
| 1110 |
if matching_word_boxes:
|
| 1111 |
# Calculate the combined bounding box for all matching words
|
|
@@ -1127,7 +1158,7 @@ class CustomImageAnalyzerEngine:
|
|
| 1127 |
text=matched_text
|
| 1128 |
)
|
| 1129 |
)
|
| 1130 |
-
#
|
| 1131 |
|
| 1132 |
return redaction_bboxes
|
| 1133 |
|
|
|
|
| 560 |
if not nlp_analyser:
|
| 561 |
raise ValueError("nlp_analyser is required for Local identification method")
|
| 562 |
|
| 563 |
+
#print("page text:", page_text)
|
| 564 |
|
| 565 |
page_analyser_result = nlp_analyser.analyze(
|
| 566 |
text=page_text,
|
|
|
|
| 1077 |
line_length = len(line_text)
|
| 1078 |
redaction_text = redaction_relevant_ocr_result.text
|
| 1079 |
|
| 1080 |
+
#print(f"Processing line: '{line_text}'")
|
| 1081 |
|
| 1082 |
for redaction_result in text_analyzer_results:
|
| 1083 |
+
#print(f"Checking redaction result: {redaction_result}")
|
| 1084 |
+
#print("redaction_text:", redaction_text)
|
| 1085 |
+
#print("line_length:", line_length)
|
| 1086 |
+
#print("line_text:", line_text)
|
| 1087 |
|
| 1088 |
+
# Check if the redaction text is not in the allow list
|
| 1089 |
|
| 1090 |
if redaction_text not in allow_list:
|
| 1091 |
|
|
|
|
| 1098 |
matched_words = matched_text.split()
|
| 1099 |
|
| 1100 |
# print(f"Found match: '{matched_text}' in line")
|
| 1101 |
+
|
| 1102 |
+
# for word_info in ocr_results_with_children_child_info.get('words', []):
|
| 1103 |
+
# # Check if this word is part of our match
|
| 1104 |
+
# if any(word.lower() in word_info['text'].lower() for word in matched_words):
|
| 1105 |
+
# matching_word_boxes.append(word_info['bounding_box'])
|
| 1106 |
+
# print(f"Matched word: {word_info['text']}")
|
| 1107 |
|
| 1108 |
# Find the corresponding words in the OCR results
|
| 1109 |
matching_word_boxes = []
|
| 1110 |
+
|
| 1111 |
+
#print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
|
| 1112 |
+
|
| 1113 |
+
current_position = 0
|
| 1114 |
+
|
| 1115 |
for word_info in ocr_results_with_children_child_info.get('words', []):
|
| 1116 |
+
word_text = word_info['text']
|
| 1117 |
+
word_length = len(word_text)
|
| 1118 |
+
|
| 1119 |
+
# Assign start and end character positions
|
| 1120 |
+
#word_info['start_position'] = current_position
|
| 1121 |
+
#word_info['end_position'] = current_position + word_length
|
| 1122 |
+
|
| 1123 |
+
word_start = current_position
|
| 1124 |
+
word_end = current_position + word_length
|
| 1125 |
+
|
| 1126 |
+
# Update current position for the next word
|
| 1127 |
+
current_position += word_length + 1 # +1 for the space after the word
|
| 1128 |
+
|
| 1129 |
+
#print("word_info['bounding_box']:", word_info['bounding_box'])
|
| 1130 |
+
#print("word_start:", word_start)
|
| 1131 |
+
#print("start_in_line:", start_in_line)
|
| 1132 |
+
|
| 1133 |
+
#print("word_end:", word_end)
|
| 1134 |
+
#print("end_in_line:", end_in_line)
|
| 1135 |
+
|
| 1136 |
+
# Check if the word's bounding box is within the start and end bounds
|
| 1137 |
+
if word_start >= start_in_line and word_end <= (end_in_line + 1):
|
| 1138 |
matching_word_boxes.append(word_info['bounding_box'])
|
| 1139 |
+
#print(f"Matched word: {word_info['text']}")
|
| 1140 |
|
| 1141 |
if matching_word_boxes:
|
| 1142 |
# Calculate the combined bounding box for all matching words
|
|
|
|
| 1158 |
text=matched_text
|
| 1159 |
)
|
| 1160 |
)
|
| 1161 |
+
#print(f"Added bounding box for: '{matched_text}'")
|
| 1162 |
|
| 1163 |
return redaction_bboxes
|
| 1164 |
|
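The rewritten matching loop above replaces substring checks with character-offset bookkeeping: each OCR word is assigned a start and end position within the line, and a word's bounding box is kept only when that span falls inside the matched span. A standalone illustration of the same logic with made-up OCR words and coordinates:

# Hypothetical OCR words for the line "Contact John Smith today"
words = [
    {"text": "Contact", "bounding_box": (10, 5, 80, 20)},
    {"text": "John",    "bounding_box": (85, 5, 120, 20)},
    {"text": "Smith",   "bounding_box": (125, 5, 170, 20)},
    {"text": "today",   "bounding_box": (175, 5, 215, 20)},
]

start_in_line, end_in_line = 8, 18   # character span of the match "John Smith"

matching_word_boxes = []
current_position = 0
for word_info in words:
    word_start = current_position
    word_end = current_position + len(word_info["text"])
    current_position = word_end + 1  # +1 for the space after the word

    # Keep the word if its character span sits inside the matched span
    if word_start >= start_in_line and word_end <= (end_in_line + 1):
        matching_word_boxes.append(word_info["bounding_box"])

print(matching_word_boxes)  # bounding boxes for "John" and "Smith" only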
tools/data_anonymise.py
CHANGED
|
@@ -12,7 +12,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
|
|
| 12 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
| 13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
| 14 |
|
| 15 |
-
from tools.helper_functions import output_folder,
|
| 16 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
| 17 |
|
| 18 |
# Use custom version of analyze_dict to be able to track progress
|
|
@@ -434,7 +434,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
| 434 |
file_type = detect_file_type(anon_file)
|
| 435 |
print("File type is:", file_type)
|
| 436 |
|
| 437 |
-
out_file_part =
|
| 438 |
|
| 439 |
if file_type == 'xlsx':
|
| 440 |
print("Running through all xlsx sheets")
|
|
@@ -472,7 +472,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
| 472 |
else:
|
| 473 |
sheet_name = ""
|
| 474 |
anon_df = read_file(anon_file)
|
| 475 |
-
out_file_part =
|
| 476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
| 477 |
|
| 478 |
# Increase latest file completed count unless we are at the last file
|
|
|
|
| 12 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
| 13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
| 14 |
|
| 15 |
+
from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
|
| 16 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
| 17 |
|
| 18 |
# Use custom version of analyze_dict to be able to track progress
|
|
|
|
| 434 |
file_type = detect_file_type(anon_file)
|
| 435 |
print("File type is:", file_type)
|
| 436 |
|
| 437 |
+
out_file_part = get_file_name_without_type(anon_file.name)
|
| 438 |
|
| 439 |
if file_type == 'xlsx':
|
| 440 |
print("Running through all xlsx sheets")
|
|
|
|
| 472 |
else:
|
| 473 |
sheet_name = ""
|
| 474 |
anon_df = read_file(anon_file)
|
| 475 |
+
out_file_part = get_file_name_without_type(anon_file.name)
|
| 476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
| 477 |
|
| 478 |
# Increase latest file completed count unless we are at the last file
|
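Both branches above now build the output file stem with get_file_name_without_type from tools.helper_functions; its body is not shown in this excerpt, but a plausible minimal implementation, assuming it simply strips the directory and the extension, is:

import os

def get_file_name_without_type(file_path: str) -> str:
    # e.g. "input/report.xlsx" -> "report"
    return os.path.splitext(os.path.basename(file_path))[0]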
tools/file_conversion.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
| 2 |
-
from tools.helper_functions import
|
| 3 |
from PIL import Image, ImageFile
|
| 4 |
import os
|
| 5 |
import re
|
|
@@ -7,6 +7,7 @@ import time
|
|
| 7 |
import json
|
| 8 |
import pymupdf
|
| 9 |
import pandas as pd
|
|
|
|
| 10 |
from pymupdf import Rect
|
| 11 |
from fitz import Page
|
| 12 |
from tqdm import tqdm
|
|
@@ -240,7 +241,7 @@ def get_input_file_names(file_input:List[str]):
|
|
| 240 |
else:
|
| 241 |
file_path = file.name
|
| 242 |
|
| 243 |
-
file_path_without_ext =
|
| 244 |
|
| 245 |
file_extension = os.path.splitext(file_path)[1].lower()
|
| 246 |
|
|
@@ -489,7 +490,7 @@ def prepare_image_or_pdf(
|
|
| 489 |
file_path = file
|
| 490 |
else:
|
| 491 |
file_path = file.name
|
| 492 |
-
file_path_without_ext =
|
| 493 |
file_name_with_ext = os.path.basename(file_path)
|
| 494 |
|
| 495 |
if not file_path:
|
|
@@ -668,7 +669,7 @@ def prepare_image_or_pdf(
|
|
| 668 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
| 669 |
|
| 670 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
| 671 |
-
file_path_without_ext =
|
| 672 |
|
| 673 |
out_file_paths = out_text_file_path
|
| 674 |
|
|
@@ -754,7 +755,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
| 754 |
if 'text' not in box:
|
| 755 |
data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
|
| 756 |
else:
|
| 757 |
-
data_to_add = {"image": image_path, "page": reported_number, "text":
|
| 758 |
#print("data_to_add:", data_to_add)
|
| 759 |
flattened_annotation_data.append(data_to_add)
|
| 760 |
|
|
@@ -764,7 +765,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
| 764 |
#print("redaction_decision_output:", redaction_decision_output)
|
| 765 |
#print("annotation_data_as_df:", annotation_data_as_df)
|
| 766 |
|
| 767 |
-
# Join on additional text data from decision output results if included
|
| 768 |
if not redaction_decision_output.empty:
|
| 769 |
#print("redaction_decision_output is not empty")
|
| 770 |
#print("redaction_decision_output:", redaction_decision_output)
|
|
@@ -793,6 +794,9 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
| 793 |
if col not in annotation_data_as_df.columns:
|
| 794 |
annotation_data_as_df[col] = ''
|
| 795 |
|
| 796 |
annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
| 797 |
|
| 798 |
return annotation_data_as_df
|
|
|
|
| 1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
| 2 |
+
from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
| 3 |
from PIL import Image, ImageFile
|
| 4 |
import os
|
| 5 |
import re
|
|
|
|
| 7 |
import json
|
| 8 |
import pymupdf
|
| 9 |
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
from pymupdf import Rect
|
| 12 |
from fitz import Page
|
| 13 |
from tqdm import tqdm
|
|
|
|
| 241 |
else:
|
| 242 |
file_path = file.name
|
| 243 |
|
| 244 |
+
file_path_without_ext = get_file_name_without_type(file_path)
|
| 245 |
|
| 246 |
file_extension = os.path.splitext(file_path)[1].lower()
|
| 247 |
|
|
|
|
| 490 |
file_path = file
|
| 491 |
else:
|
| 492 |
file_path = file.name
|
| 493 |
+
file_path_without_ext = get_file_name_without_type(file_path)
|
| 494 |
file_name_with_ext = os.path.basename(file_path)
|
| 495 |
|
| 496 |
if not file_path:
|
|
|
|
| 669 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
| 670 |
|
| 671 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
| 672 |
+
file_path_without_ext = get_file_name_without_type(in_file_path)
|
| 673 |
|
| 674 |
out_file_paths = out_text_file_path
|
| 675 |
|
|
|
|
| 755 |
if 'text' not in box:
|
| 756 |
data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
|
| 757 |
else:
|
| 758 |
+
data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
|
| 759 |
#print("data_to_add:", data_to_add)
|
| 760 |
flattened_annotation_data.append(data_to_add)
|
| 761 |
|
|
|
|
| 765 |
#print("redaction_decision_output:", redaction_decision_output)
|
| 766 |
#print("annotation_data_as_df:", annotation_data_as_df)
|
| 767 |
|
| 768 |
+
# Join on additional text data from decision output results if included, if text not already there
|
| 769 |
if not redaction_decision_output.empty:
|
| 770 |
#print("redaction_decision_output is not empty")
|
| 771 |
#print("redaction_decision_output:", redaction_decision_output)
|
|
|
|
| 794 |
if col not in annotation_data_as_df.columns:
|
| 795 |
annotation_data_as_df[col] = ''
|
| 796 |
|
| 797 |
+
for col in ['xmin', 'xmax', 'ymin', 'ymax']:
|
| 798 |
+
annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
|
| 799 |
+
|
| 800 |
annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
| 801 |
|
| 802 |
return annotation_data_as_df
|
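The added np.floor step normalises the box coordinates, presumably so that boxes coming from the annotation JSON and from the redaction decision output land on the same whole-number values before they are sorted and joined. A toy illustration of the effect:

import numpy as np
import pandas as pd

# The same box reported with slightly different fractional coordinates by two sources
df = pd.DataFrame({"xmin": [100.0, 100.7], "ymin": [200.2, 200.9]})

df[["xmin", "ymin"]] = np.floor(df[["xmin", "ymin"]])
print(df.drop_duplicates())  # one row: xmin 100.0, ymin 200.0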
tools/file_redaction.py
CHANGED
|
@@ -27,8 +27,8 @@ from presidio_analyzer import RecognizerResult
|
|
| 27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
| 28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
| 29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
| 30 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
| 31 |
-
from tools.helper_functions import
|
| 32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
| 33 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
| 34 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
|
@@ -94,6 +94,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 94 |
page_break_return:bool=False,
|
| 95 |
pii_identification_method:str="Local",
|
| 96 |
comprehend_query_number:int=0,
|
|
| 97 |
output_folder:str=output_folder,
|
| 98 |
progress=gr.Progress(track_tqdm=True)):
|
| 99 |
'''
|
|
@@ -127,6 +129,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 127 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
| 128 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
| 129 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
| 130 |
- output_folder (str, optional): Output folder for results.
|
| 131 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
| 132 |
|
|
@@ -279,9 +283,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 279 |
file_path = file.name
|
| 280 |
|
| 281 |
if file_path:
|
| 282 |
-
pdf_file_name_without_ext =
|
| 283 |
pdf_file_name_with_ext = os.path.basename(file_path)
|
| 284 |
-
print("Redacting file:", pdf_file_name_with_ext)
|
| 285 |
|
| 286 |
is_a_pdf = is_pdf(file_path) == True
|
| 287 |
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
|
@@ -327,7 +331,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 327 |
comprehend_client,
|
| 328 |
textract_client,
|
| 329 |
custom_recogniser_word_list,
|
| 330 |
-
redact_whole_page_list
|
| 331 |
|
| 332 |
|
| 333 |
#print("log_files_output_paths at end of image redact function:", log_files_output_paths)
|
|
@@ -366,7 +372,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 366 |
comprehend_query_number,
|
| 367 |
comprehend_client,
|
| 368 |
custom_recogniser_word_list,
|
| 369 |
-
redact_whole_page_list
|
| 370 |
|
| 371 |
else:
|
| 372 |
out_message = "No redaction method selected"
|
|
@@ -414,13 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 414 |
|
| 415 |
# Save the gradio_annotation_boxes to a JSON file
|
| 416 |
try:
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
| 420 |
-
with open(out_annotation_file_path, 'w') as f:
|
| 421 |
-
json.dump(annotations_all_pages, f)
|
| 422 |
-
log_files_output_paths.append(out_annotation_file_path)
|
| 423 |
-
|
| 424 |
#print("Saving annotations to CSV")
|
| 425 |
|
| 426 |
# Convert json to csv and also save this
|
|
@@ -435,6 +437,13 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
| 435 |
|
| 436 |
print("Saved review file to csv")
|
| 437 |
|
| 438 |
except Exception as e:
|
| 439 |
print("Could not save annotations to json or csv file:", e)
|
| 440 |
|
|
@@ -694,10 +703,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
|
|
| 694 |
x1 = pymupdf_x1
|
| 695 |
x2 = pymupdf_x2
|
| 696 |
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
|
| 702 |
# Else should be CustomImageRecognizerResult
|
| 703 |
else:
|
|
@@ -715,10 +724,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
|
|
| 715 |
img_annotation_box["label"] = annot.entity_type
|
| 716 |
except:
|
| 717 |
img_annotation_box["label"] = "Redaction"
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
|
|
|
| 722 |
|
| 723 |
rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
|
| 724 |
|
|
@@ -749,12 +759,14 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
|
|
| 749 |
|
| 750 |
if isinstance(annot, Dictionary):
|
| 751 |
img_annotation_box["label"] = str(annot["/T"])
|
| 752 |
else:
|
| 753 |
img_annotation_box["label"] = "REDACTION"
|
| 754 |
-
|
| 755 |
-
# img_annotation_box["text"] = annot.text
|
| 756 |
-
# else:
|
| 757 |
-
# img_annotation_box["text"] = ""
|
| 758 |
|
| 759 |
# Convert to a PyMuPDF Rect object
|
| 760 |
#rect = Rect(rect_coordinates)
|
|
@@ -913,6 +925,8 @@ def redact_image_pdf(file_path:str,
|
|
| 913 |
textract_client:str="",
|
| 914 |
custom_recogniser_word_list:List[str]=[],
|
| 915 |
redact_whole_page_list:List[str]=[],
|
| 916 |
page_break_val:int=int(page_break_value),
|
| 917 |
log_files_output_paths:List=[],
|
| 918 |
max_time:int=int(max_time_value),
|
|
@@ -945,14 +959,16 @@ def redact_image_pdf(file_path:str,
|
|
| 945 |
- textract_client (optional): A connection to the AWS Textract service via the boto3 package.
|
| 946 |
- custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
|
| 947 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
| 948 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
|
| 949 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
| 950 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
| 951 |
- progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
| 952 |
|
| 953 |
-
The function returns a
|
| 954 |
'''
|
| 955 |
-
file_name =
|
| 956 |
fill = (0, 0, 0) # Fill colour for redactions
|
| 957 |
comprehend_query_number_new = 0
|
| 958 |
|
|
@@ -962,11 +978,14 @@ def redact_image_pdf(file_path:str,
|
|
| 962 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
| 963 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
| 964 |
#print("new_custom_recogniser:", new_custom_recogniser)
|
| 965 |
-
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
| 966 |
|
| 967 |
|
| 968 |
-
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
| 969 |
-
|
| 970 |
|
| 971 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
| 972 |
print("Connection to AWS Comprehend service unsuccessful.")
|
|
@@ -1190,6 +1209,7 @@ def redact_image_pdf(file_path:str,
|
|
| 1190 |
|
| 1191 |
## Apply annotations with pymupdf
|
| 1192 |
else:
|
|
|
|
| 1193 |
#print("redact_whole_page_list:", redact_whole_page_list)
|
| 1194 |
if redact_whole_page_list:
|
| 1195 |
int_reported_page_number = int(reported_page_number)
|
|
@@ -1471,6 +1491,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
| 1471 |
def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
| 1472 |
pikepdf_annotations_on_page = []
|
| 1473 |
for analysed_bounding_box in analysed_bounding_boxes:
|
| 1474 |
bounding_box = analysed_bounding_box["boundingBox"]
|
| 1475 |
annotation = Dictionary(
|
| 1476 |
Type=Name.Annot,
|
|
@@ -1482,6 +1504,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
|
| 1482 |
IC=[0, 0, 0],
|
| 1483 |
CA=1, # Transparency
|
| 1484 |
T=analysed_bounding_box["result"].entity_type,
|
|
|
|
| 1485 |
BS=Dictionary(
|
| 1486 |
W=0, # Border width: 1 point
|
| 1487 |
S=Name.S # Border style: solid
|
|
@@ -1511,6 +1534,8 @@ def redact_text_pdf(
|
|
| 1511 |
comprehend_client="",
|
| 1512 |
custom_recogniser_word_list:List[str]=[],
|
| 1513 |
redact_whole_page_list:List[str]=[],
|
| 1514 |
page_break_val: int = int(page_break_value), # Value for page break
|
| 1515 |
max_time: int = int(max_time_value),
|
| 1516 |
progress: Progress = Progress(track_tqdm=True) # Progress tracking object
|
|
@@ -1540,6 +1565,8 @@ def redact_text_pdf(
|
|
| 1540 |
- comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
|
| 1541 |
- custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
|
| 1542 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
| 1543 |
- page_break_val: Value for page break
|
| 1544 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
| 1545 |
- progress: Progress tracking object
|
|
@@ -1555,9 +1582,12 @@ def redact_text_pdf(
|
|
| 1555 |
if custom_recogniser_word_list:
|
| 1556 |
nlp_analyser.registry.remove_recognizer("CUSTOM")
|
| 1557 |
new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
|
| 1558 |
-
#print("new_custom_recogniser:", new_custom_recogniser)
|
| 1559 |
nlp_analyser.registry.add_recognizer(new_custom_recogniser)
|
| 1560 |
|
| 1561 |
# List all elements currently in the nlp_analyser registry
|
| 1562 |
#print("Current recognizers in nlp_analyser registry:")
|
| 1563 |
#for recognizer_name in nlp_analyser.registry.recognizers:
|
|
@@ -1660,7 +1690,7 @@ def redact_text_pdf(
|
|
| 1660 |
language,
|
| 1661 |
chosen_redact_entities,
|
| 1662 |
chosen_redact_comprehend_entities,
|
| 1663 |
-
all_line_level_text_results_list,
|
| 1664 |
all_line_characters,
|
| 1665 |
page_analyser_results,
|
| 1666 |
page_analysed_bounding_boxes,
|
|
@@ -1673,7 +1703,6 @@ def redact_text_pdf(
|
|
| 1673 |
comprehend_query_number
|
| 1674 |
)
|
| 1675 |
|
| 1676 |
-
|
| 1677 |
#print("page_analyser_results:", page_analyser_results)
|
| 1678 |
#print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
|
| 1679 |
#print("image:", image)
|
|
@@ -1688,7 +1717,7 @@ def redact_text_pdf(
|
|
| 1688 |
# Annotate redactions on page
|
| 1689 |
pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
|
| 1690 |
|
| 1691 |
-
#print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
|
| 1692 |
|
| 1693 |
# Make pymupdf page redactions
|
| 1694 |
#print("redact_whole_page_list:", redact_whole_page_list)
|
|
|
|
| 27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
| 28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
| 29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
| 30 |
+
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
| 31 |
+
from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
| 32 |
from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
|
| 33 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
|
| 34 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
|
|
|
| 94 |
page_break_return:bool=False,
|
| 95 |
pii_identification_method:str="Local",
|
| 96 |
comprehend_query_number:int=0,
|
| 97 |
+
max_fuzzy_spelling_mistakes_num:int=1,
|
| 98 |
+
match_fuzzy_whole_phrase_bool:bool=True,
|
| 99 |
output_folder:str=output_folder,
|
| 100 |
progress=gr.Progress(track_tqdm=True)):
|
| 101 |
'''
|
|
|
|
| 129 |
- page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
|
| 130 |
- pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
|
| 131 |
- comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
|
| 132 |
+
- max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
|
| 133 |
+
- match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
|
| 134 |
- output_folder (str, optional): Output folder for results.
|
| 135 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
| 136 |
|
|
|
|
| 283 |
file_path = file.name
|
| 284 |
|
| 285 |
if file_path:
|
| 286 |
+
pdf_file_name_without_ext = get_file_name_without_type(file_path)
|
| 287 |
pdf_file_name_with_ext = os.path.basename(file_path)
|
| 288 |
+
# print("Redacting file:", pdf_file_name_with_ext)
|
| 289 |
|
| 290 |
is_a_pdf = is_pdf(file_path) == True
|
| 291 |
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
|
|
|
| 331 |
comprehend_client,
|
| 332 |
textract_client,
|
| 333 |
custom_recogniser_word_list,
|
| 334 |
+
redact_whole_page_list,
|
| 335 |
+
max_fuzzy_spelling_mistakes_num,
|
| 336 |
+
match_fuzzy_whole_phrase_bool)
|
| 337 |
|
| 338 |
|
| 339 |
#print("log_files_output_paths at end of image redact function:", log_files_output_paths)
|
|
|
|
| 372 |
comprehend_query_number,
|
| 373 |
comprehend_client,
|
| 374 |
custom_recogniser_word_list,
|
| 375 |
+
redact_whole_page_list,
|
| 376 |
+
max_fuzzy_spelling_mistakes_num,
|
| 377 |
+
match_fuzzy_whole_phrase_bool)
|
| 378 |
|
| 379 |
else:
|
| 380 |
out_message = "No redaction method selected"
|
|
|
|
| 422 |
|
| 423 |
# Save the gradio_annotation_boxes to a JSON file
|
| 424 |
try:
|
| 425 |
+
|
|
| 426 |
#print("Saving annotations to CSV")
|
| 427 |
|
| 428 |
# Convert json to csv and also save this
|
|
|
|
| 437 |
|
| 438 |
print("Saved review file to csv")
|
| 439 |
|
| 440 |
+
out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
|
| 441 |
+
with open(out_annotation_file_path, 'w') as f:
|
| 442 |
+
json.dump(annotations_all_pages, f)
|
| 443 |
+
log_files_output_paths.append(out_annotation_file_path)
|
| 444 |
+
|
| 445 |
+
print("Saving annotations to JSON")
|
| 446 |
+
|
| 447 |
except Exception as e:
|
| 448 |
print("Could not save annotations to json or csv file:", e)
|
| 449 |
|
|
|
|
| 703 |
x1 = pymupdf_x1
|
| 704 |
x2 = pymupdf_x2
|
| 705 |
|
| 706 |
+
if hasattr(annot, 'text') and annot.text:
|
| 707 |
+
img_annotation_box["text"] = annot.text
|
| 708 |
+
else:
|
| 709 |
+
img_annotation_box["text"] = ""
|
| 710 |
|
| 711 |
# Else should be CustomImageRecognizerResult
|
| 712 |
else:
|
|
|
|
| 724 |
img_annotation_box["label"] = annot.entity_type
|
| 725 |
except:
|
| 726 |
img_annotation_box["label"] = "Redaction"
|
| 727 |
+
|
| 728 |
+
if hasattr(annot, 'text') and annot.text:
|
| 729 |
+
img_annotation_box["text"] = annot.text
|
| 730 |
+
else:
|
| 731 |
+
img_annotation_box["text"] = ""
|
| 732 |
|
| 733 |
rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
|
| 734 |
|
|
|
|
| 759 |
|
| 760 |
if isinstance(annot, Dictionary):
|
| 761 |
img_annotation_box["label"] = str(annot["/T"])
|
| 762 |
+
|
| 763 |
+
if hasattr(annot, 'Contents'):
|
| 764 |
+
img_annotation_box["text"] = annot.Contents
|
| 765 |
+
else:
|
| 766 |
+
img_annotation_box["text"] = ""
|
| 767 |
else:
|
| 768 |
img_annotation_box["label"] = "REDACTION"
|
| 769 |
+
img_annotation_box["text"] = ""
|
| 770 |
|
| 771 |
# Convert to a PyMuPDF Rect object
|
| 772 |
#rect = Rect(rect_coordinates)
|
|
|
|
| 925 |
textract_client:str="",
|
| 926 |
custom_recogniser_word_list:List[str]=[],
|
| 927 |
redact_whole_page_list:List[str]=[],
|
| 928 |
+
max_fuzzy_spelling_mistakes_num:int=1,
|
| 929 |
+
match_fuzzy_whole_phrase_bool:bool=True,
|
| 930 |
page_break_val:int=int(page_break_value),
|
| 931 |
log_files_output_paths:List=[],
|
| 932 |
max_time:int=int(max_time_value),
|
|
|
|
| 959 |
- textract_client (optional): A connection to the AWS Textract service via the boto3 package.
|
| 960 |
- custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
|
| 961 |
- redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
|
| 962 |
+
     - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
     - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
 
+    The function returns a redacted PDF document along with processing output objects.
     '''
+    file_name = get_file_name_without_type(file_path)
     fill = (0, 0, 0)   # Fill colour for redactions
     comprehend_query_number_new = 0
 
     nlp_analyser.registry.remove_recognizer("CUSTOM")
     new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
     #print("new_custom_recogniser:", new_custom_recogniser)
+    nlp_analyser.registry.add_recognizer(new_custom_recogniser)
 
+    nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
+    new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
+    #print("new_custom_recogniser:", new_custom_recogniser)
+    nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
 
+    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
     if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
         print("Connection to AWS Comprehend service unsuccessful.")
 
     ## Apply annotations with pymupdf
     else:
+        print("merged_redaction_boxes:", merged_redaction_bboxes)
         #print("redact_whole_page_list:", redact_whole_page_list)
         if redact_whole_page_list:
             int_reported_page_number = int(reported_page_number)
 
 def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
     pikepdf_annotations_on_page = []
     for analysed_bounding_box in analysed_bounding_boxes:
+        #print("analysed_bounding_box:", analysed_bounding_boxes)
+
         bounding_box = analysed_bounding_box["boundingBox"]
         annotation = Dictionary(
             Type=Name.Annot,
 
             IC=[0, 0, 0],
             CA=1, # Transparency
             T=analysed_bounding_box["result"].entity_type,
+            Contents=analysed_bounding_box["text"],
             BS=Dictionary(
                 W=0, # Border width: 1 point
                 S=Name.S # Border style: solid
 
     comprehend_client="",
     custom_recogniser_word_list:List[str]=[],
     redact_whole_page_list:List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=1,
+    match_fuzzy_whole_phrase_bool:bool=True,
     page_break_val: int = int(page_break_value), # Value for page break
     max_time: int = int(max_time_value),
     progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
     - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
     - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
     - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
 
     if custom_recogniser_word_list:
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
         nlp_analyser.registry.add_recognizer(new_custom_recogniser)
 
+        nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
     # List all elements currently in the nlp_analyser registry
     #print("Current recognizers in nlp_analyser registry:")
     #for recognizer_name in nlp_analyser.registry.recognizers:
 
         language,
         chosen_redact_entities,
         chosen_redact_comprehend_entities,
+        all_line_level_text_results_list,
         all_line_characters,
         page_analyser_results,
         page_analysed_bounding_boxes,
 
         comprehend_query_number
     )
 
     #print("page_analyser_results:", page_analyser_results)
     #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
     #print("image:", image)
 
     # Annotate redactions on page
     pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
 
+    # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
 
     # Make pymupdf page redactions
     #print("redact_whole_page_list:", redact_whole_page_list)
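Before each run, the code above swaps the deny-list recognisers: the existing "CUSTOM" and "CUSTOM_FUZZY" recognisers are removed from the Presidio registry and rebuilt from the current word list, with the fuzzy one taking the new spelling-mistake and whole-phrase options. A minimal sketch of that wiring outside the app, not part of the commit: it assumes nlp_analyser, custom_word_list_recogniser and CustomWordFuzzyRecognizer are importable from tools.load_spacy_model_custom_recognisers, and the deny list and sample text are invented.

# Illustrative only: rebuild the deny-list recognisers for one run, then analyse some text.
from tools.load_spacy_model_custom_recognisers import (
    nlp_analyser, custom_word_list_recogniser, CustomWordFuzzyRecognizer)

deny_list = ["Jane Doe", "Acme Corporation"]   # hypothetical deny list

# Exact-match recogniser for the deny list
nlp_analyser.registry.remove_recognizer("CUSTOM")
nlp_analyser.registry.add_recognizer(custom_word_list_recogniser(deny_list))

# Fuzzy recogniser for the same deny list, allowing up to two character edits per phrase
nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
nlp_analyser.registry.add_recognizer(CustomWordFuzzyRecognizer(
    supported_entities=["CUSTOM_FUZZY"],
    custom_list=deny_list,
    spelling_mistakes_max=2,
    search_whole_phrase=True))

results = nlp_analyser.analyze(
    text="Letter to Jane Does of Acme Corporatin.",   # invented, deliberately misspelled
    language="en",
    entities=["CUSTOM", "CUSTOM_FUZZY"],
    score_threshold=0.001)
print(results)   # expected to flag the misspelled deny-list phrases via CUSTOM_FUZZY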
tools/helper_functions.py
CHANGED

@@ -4,26 +4,12 @@ import boto3
 from botocore.exceptions import ClientError
 import gradio as gr
 import pandas as pd
+import numpy as np
 import unicodedata
 from typing import List
 from gradio_image_annotation import image_annotator
 from tools.auth import user_pool_id
 
-def reset_state_vars():
-    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
-        label="Modify redaction boxes",
-        label_list=["Redaction"],
-        label_colors=[(0, 0, 0)],
-        show_label=False,
-        sources=None,#["upload"],
-        show_clear_button=False,
-        show_share_button=False,
-        show_remove_button=False,
-        interactive=False
-    ), [], [], [], pd.DataFrame(), pd.DataFrame()
-
-def reset_review_vars():
-    return [], pd.DataFrame(), pd.DataFrame()
 
 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists

@@ -51,13 +37,40 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
 input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
 print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
 
+# Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
+print(f'CUSTOM_HEADER found')
+
+# Retrieving or setting CUSTOM_HEADER_VALUE
+CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
+print(f'CUSTOM_HEADER_VALUE found')
+
+
+def reset_state_vars():
+    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
+        label="Modify redaction boxes",
+        label_list=["Redaction"],
+        label_colors=[(0, 0, 0)],
+        show_label=False,
+        sources=None,#["upload"],
+        show_clear_button=False,
+        show_share_button=False,
+        show_remove_button=False,
+        interactive=False
+    ), [], [], [], pd.DataFrame(), pd.DataFrame()
+
+def reset_review_vars():
+    return [], pd.DataFrame(), pd.DataFrame()
+
+
 def load_in_default_allow_list(allow_list_file_path):
     if isinstance(allow_list_file_path, str):
         allow_list_file_path = [allow_list_file_path]
     return allow_list_file_path
 
 
-def 
+def get_file_name_without_type(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)

@@ -126,7 +139,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
     if regex_file_names:
         regex_file_name = regex_file_names[0]
         custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
-        #regex_file_name_no_ext = 
+        #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
 
         custom_regex.columns = custom_regex.columns.astype(str)

@@ -220,13 +233,41 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
     except Exception as e:
         print("Could not remove usage logs file", e)
 
-
-
-
+def merge_csv_files(file_list):
+
+    # Initialise an empty list to hold DataFrames
+    dataframes = []
+    output_files = []
+
+    # Loop through each file in the file list
+    for file in file_list:
+        # Read the CSV file into a DataFrame
+        df = pd.read_csv(file.name)
+        dataframes.append(df)
+
+    # Concatenate all DataFrames into a single DataFrame
+    merged_df = pd.concat(dataframes, ignore_index=True)
+
+    for col in ['xmin', 'xmax', 'ymin', 'ymax']:
+        merged_df[col] = np.floor(merged_df[col])
+
+    merged_df = merged_df.drop_duplicates(subset=['page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax'])
+
+    merged_df = merged_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
+    file_out_name = os.path.basename(file_list[0])
+
+    merged_csv_path = output_folder + file_out_name + "_merged.csv"
+
+    # Save the merged DataFrame to a CSV file
+    #merged_csv = StringIO()
+    merged_df.to_csv(merged_csv_path, index=False)
+    output_files.append(merged_csv_path)
+    #merged_csv.seek(0) # Move to the beginning of the StringIO object
+
+    return output_files
+
 
-# Retrieving or setting CUSTOM_HEADER_VALUE
-CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
-print(f'CUSTOM_HEADER_VALUE found')
 
 async def get_connection_params(request: gr.Request):
     base_folder = ""
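The new merge_csv_files helper concatenates several redaction review CSVs, floors the box coordinates so that near-identical boxes collapse, drops duplicate rows and writes one merged file to the output folder. A minimal sketch of the dedup step on in-memory data, not part of the commit: the column names match those used in the function above, the row values are invented.

# Illustrative only: the coordinate-flooring and dedup logic used by merge_csv_files.
import numpy as np
import pandas as pd

reviews = [
    pd.DataFrame({"page": [1], "label": ["Redaction"], "color": ["(0, 0, 0)"],
                  "xmin": [100.2], "ymin": [200.7], "xmax": [180.1], "ymax": [215.3]}),
    pd.DataFrame({"page": [1], "label": ["Redaction"], "color": ["(0, 0, 0)"],
                  "xmin": [100.9], "ymin": [200.1], "xmax": [180.6], "ymax": [215.8]}),
]

merged = pd.concat(reviews, ignore_index=True)
for col in ["xmin", "xmax", "ymin", "ymax"]:
    merged[col] = np.floor(merged[col])   # floor so near-identical boxes become identical
merged = merged.drop_duplicates(subset=["page", "label", "color", "xmin", "ymin", "xmax", "ymax"])
merged = merged.sort_values(["page", "ymin", "xmin", "label"])
print(merged)   # the two near-identical boxes collapse to a single row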
tools/load_spacy_model_custom_recognisers.py
CHANGED

@@ -3,9 +3,13 @@ from typing import List
 from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
 from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 import spacy
+from spacy.matcher import Matcher, PhraseMatcher
+from spaczz.matcher import FuzzyMatcher
 spacy.prefer_gpu()
 from spacy.cli.download import download
+import Levenshtein
 import re
+import gradio as gr
 
 model_name = "en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001

@@ -65,16 +69,8 @@ ukpostcode_pattern = Pattern(
 # Define the recognizer with one or more patterns
 ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
 
-
-# Examples for testing
-
-#text = "I live in 510 Broad st SE5 9NG ."
-
-#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
-#print("Result:")
-#print(numbers_result)
+### Street name
 
-# %%
 def extract_street_name(text:str) -> str:
     """
     Extracts the street name and preceding word (that should contain at least one number) from the given text.

@@ -101,7 +97,7 @@ def extract_street_name(text:str) -> str:
     pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
 
     # Find all matches in text
-    matches = re.finditer(pattern, text, re.IGNORECASE)
+    matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
 
     start_positions = []
     end_positions = []

@@ -120,19 +116,6 @@ def extract_street_name(text:str) -> str:
 
     return start_positions, end_positions
 
-
-# %%
-# Some examples for testing
-
-#text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
-#text = "Roberto lives in Five 10 Broad st in Oregon"
-#text = "Roberto lives in 55 Oregon Square"
-#text = "There is 51a no way I will do that"
-#text = "I am writing to apply for"
-
-#extract_street_name(text)
-
-# %%
 class StreetNameRecognizer(EntityRecognizer):
 
     def load(self) -> None:

@@ -163,14 +146,181 @@ class StreetNameRecognizer(EntityRecognizer):
 
 street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
 
+## Custom fuzzy match recogniser for list of strings
+def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
+    # Create regex pattern, handling quotes carefully
+
+    quote_str = '"'
+    replace_str = '(?:"|"|")'
+
+    custom_regex_pattern = '|'.join(
+        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+        for term in custom_list
+    )
+
+    # Find all matches in text
+    matches = re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
+
+    start_positions = []
+    end_positions = []
+
+    for match in matches:
+        start_pos = match.start()
+        end_pos = match.end()
+
+        start_positions.append(start_pos)
+        end_positions.append(end_pos)
+
+    return start_positions, end_positions
+
+def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of text data.'''
+
+    all_matches = []
+    all_start_positions = []
+    all_end_positions = []
+    all_ratios = []
+
+    #print("custom_query_list:", custom_query_list)
+
+    if not text:
+        out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+        print(out_message)
+        return out_message, None
+
+    for string_query in custom_query_list:
+
+        #print("text:", text)
+        #print("string_query:", string_query)
+
+        query = nlp(string_query)
+
+        if search_whole_phrase == False:
+            # Keep only words that are not stop words
+            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
+
+            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+            #print("token_query:", token_query)
+
+            if len(token_query) > 1:
+                #pattern_lemma = [{"LEMMA": {"IN": query}}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
+            else:
+                #pattern_lemma = [{"LEMMA": query[0]}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]
+
+            matcher = Matcher(nlp.vocab)
+            matcher.add(string_query, [pattern_fuzz])
+            #matcher.add(string_query, [pattern_lemma])
+
+        else:
+            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+            #tokenised_query = [string_query.lower()]
+            # If you want to match the whole phrase, use phrase matcher
+            matcher = FuzzyMatcher(nlp.vocab)
+            patterns = [nlp.make_doc(string_query)]  # Convert query into a Doc object
+            matcher.add("PHRASE", patterns, [{"ignore_case": True}])
+
+        batch_size = 256
+        docs = nlp.pipe([text], batch_size=batch_size)
+
+        # Get number of matches per doc
+        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
+            matches = matcher(doc)
+            match_count = len(matches)
+
+            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
+            if search_whole_phrase==False:
+                all_matches.append(match_count)
+
+                for match_id, start, end in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    #print("doc:", doc)
+                    #print("span:", span)
+                    #print("query_search:", query_search)
+
+                    # Convert word positions to character positions
+                    start_char = doc[start].idx  # Start character position
+                    end_char = doc[end - 1].idx + len(doc[end - 1])  # End character position
+
+                    # The positions here are word position, not character position
+                    all_matches.append(match_count)
+                    all_start_positions.append(start_char)
+                    all_end_positions.append(end_char)
+
+            else:
+                for match_id, start, end, ratio, pattern in matches:
+                    span = str(doc[start:end]).strip()
+                    query_search = str(query).strip()
+                    print("doc:", doc)
+                    print("span:", span)
+                    print("query_search:", query_search)
+
+                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
+                    distance = Levenshtein.distance(query_search.lower(), span.lower())
+
+                    print("Levenshtein distance:", distance)
+
+                    if distance > spelling_mistakes_max:
+                        match_count = match_count - 1
+                    else:
+                        # Convert word positions to character positions
+                        start_char = doc[start].idx  # Start character position
+                        end_char = doc[end - 1].idx + len(doc[end - 1])  # End character position
+
+                        print("start_char:", start_char)
+                        print("end_char:", end_char)
+
+                        all_matches.append(match_count)
+                        all_start_positions.append(start_char)
+                        all_end_positions.append(end_char)
+                        all_ratios.append(ratio)
+
+
+    return all_start_positions, all_end_positions
+
+
+class CustomWordFuzzyRecognizer(EntityRecognizer):
+    def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
+        super().__init__(supported_entities=supported_entities)
+        self.custom_list = custom_list  # Store the custom_list as an instance attribute
+        self.spelling_mistakes_max = spelling_mistakes_max  # Store the max spelling mistakes
+        self.search_whole_phrase = search_whole_phrase  # Store the search whole phrase flag
+
+    def load(self) -> None:
+        """No loading is required."""
+        pass
+
+    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
+        """
+        Logic for detecting a specific PII
+        """
+        start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase)  # Pass new parameters
+
+        results = []
+
+        for i in range(0, len(start_pos)):
+            result = RecognizerResult(
+                entity_type="CUSTOM_FUZZY",
+                start=start_pos[i],
+                end=end_pos[i],
+                score=1
+            )
+            results.append(result)
+
+        return results
+
+custom_list_default = []
+custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
+
 # Create a class inheriting from SpacyNlpEngine
 class LoadedSpacyNlpEngine(SpacyNlpEngine):
     def __init__(self, loaded_spacy_model):
         super().__init__()
         self.nlp = {"en": loaded_spacy_model}
 
-
-
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)

@@ -186,4 +336,5 @@ nlp_analyser.registry.add_recognizer(street_recogniser)
 nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
 nlp_analyser.registry.add_recognizer(titles_recogniser)
 nlp_analyser.registry.add_recognizer(custom_recogniser)
+nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
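spacy_fuzzy_search therefore has two modes: with search_whole_phrase=True it runs the spaczz FuzzyMatcher over the text and keeps only spans whose Levenshtein distance from the query is within spelling_mistakes_max; with search_whole_phrase=False it fuzzy-matches each non-stop-word token with spaCy's Matcher FUZZY operator. A minimal usage sketch, not part of the commit: it assumes the en_core_web_sm model, spaczz and the Levenshtein package are installed, and the sample text and query are invented.

# Illustrative only: whole-phrase fuzzy search allowing one spelling mistake.
from tools.load_spacy_model_custom_recognisers import spacy_fuzzy_search

text = "Please write to Mr John Smyth at 12 Example Street."   # invented sample
starts, ends = spacy_fuzzy_search(text,
                                  custom_query_list=["John Smith"],
                                  spelling_mistakes_max=1,
                                  search_whole_phrase=True)
for s, e in zip(starts, ends):
    print(text[s:e])   # expected to print the fuzzy hit, e.g. "John Smyth"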
tools/redaction_review.py
CHANGED

@@ -8,7 +8,7 @@ from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
 from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
-from tools.helper_functions import 
+from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 import json
 import os

@@ -68,6 +68,12 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
     for image, items in image_groups.items():
         # Filter items with non-empty boxes
         non_empty_boxes = [item for item in items if item.get('boxes')]
+
+        # Remove 'text' elements from boxes
+        for item in non_empty_boxes:
+            if 'boxes' in item:
+                item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
+
         if non_empty_boxes:
             # Keep the first entry with non-empty boxes
             result.append(non_empty_boxes[0])

@@ -175,6 +181,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
 
     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
 
+
+
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,

@@ -264,7 +272,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
 
     for file_path in file_paths:
         #print("file_path:", file_path)
-        file_name_without_ext = 
+        file_name_without_ext = get_file_name_without_type(file_path)
         file_name_with_ext = os.path.basename(file_path)
 
         file_extension = os.path.splitext(file_path)[1].lower()

@@ -544,7 +552,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
     else:
         file_path = file.name
 
-        file_path_name = 
+        file_path_name = get_file_name_without_type(file_path)
         file_path_end = detect_file_type(file_path)
 
         if file_path_end == "pdf":

@@ -675,7 +683,7 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
     else:
         file_path = file.name
 
-        file_path_name = 
+        file_path_name = get_file_name_without_type(file_path)
         file_path_end = detect_file_type(file_path)
 
         if file_path_end == "pdf":

@@ -699,7 +707,7 @@ def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
     # else:
     #     xfdf_path = xfdf_paths[0].name
 
-    file_path_name = 
+    file_path_name = get_file_name_without_type(xfdf_path)
 
     #print("file_path_name:", file_path_name)
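The new block in remove_duplicate_images_with_blank_boxes strips the 'text' field from each box before duplicate entries for the same image are collapsed; review files now carry the redacted text (per the commit message), and presumably only the box geometry, label and colour are needed when the annotations are passed back to the image annotator. A minimal sketch of that transformation on made-up annotation data, not part of the commit:

# Illustrative only: dropping the 'text' key from annotation boxes, as the new code does.
annotations = [
    {"image": "page_1.png",
     "boxes": [{"label": "Redaction", "color": (0, 0, 0),
                "xmin": 10, "ymin": 20, "xmax": 50, "ymax": 30,
                "text": "Jane Doe"}]},   # invented example box
]

for item in annotations:
    if "boxes" in item:
        item["boxes"] = [{k: v for k, v in box.items() if k != "text"} for box in item["boxes"]]

print(annotations[0]["boxes"][0])   # same box, with the 'text' key removed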