seanpedrickcase commited on
Commit
bce761b
·
1 Parent(s): 9d7cf92

Added possibility of changing model and entity types in config file

Browse files
app.py CHANGED
@@ -1,12 +1,10 @@
1
  import os
2
- import logging
3
  import pandas as pd
4
  import gradio as gr
5
  from gradio_image_annotation import image_annotator
6
-
7
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, TESSERACT_FOLDER, POPPLER_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, AWS_REGION
8
- from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
9
- from tools.aws_functions import upload_file_to_s3, download_file_from_s3, upload_log_file_to_s3
10
  from tools.file_redaction import choose_and_run_redactor
11
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
12
  from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
@@ -20,30 +18,7 @@ from tools.textract_batch_call import analyse_document_with_textract_api, poll_w
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
22
 
23
- chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
24
-
25
- full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
26
-
27
- # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
28
- chosen_comprehend_entities.extend(custom_entities)
29
- full_comprehend_entity_list.extend(custom_entities)
30
-
31
- # Entities for local PII redaction option
32
- chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
33
-
34
- full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
35
-
36
- log_file_name = 'log.csv'
37
-
38
- file_input_height = 200
39
-
40
- if RUN_AWS_FUNCTIONS == "1":
41
- default_ocr_val = textract_option
42
- default_pii_detector = local_pii_detector
43
- else:
44
- default_ocr_val = text_ocr_option
45
- default_pii_detector = local_pii_detector
46
-
47
  SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
48
  SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)
49
 
@@ -55,6 +30,17 @@ if DYNAMODB_ACCESS_LOG_HEADERS: DYNAMODB_ACCESS_LOG_HEADERS = eval(DYNAMODB_ACCE
55
  if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
56
  if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
57
 
 
 
 
 
 
 
 
 
 
 
 
58
  # Create the gradio interface
59
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
60
 
@@ -66,8 +52,7 @@ with app:
66
 
67
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
68
  pdf_doc_state = gr.State([])
69
- all_image_annotations_state = gr.State([])
70
-
71
 
72
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
73
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
@@ -105,11 +90,11 @@ with app:
105
  backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
106
 
107
  # Logging state
108
- feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + log_file_name, visible=False)
109
  feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
110
- access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + log_file_name, visible=False)
111
  access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
112
- usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + log_file_name, visible=False)
113
  usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
114
 
115
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
@@ -172,8 +157,8 @@ with app:
172
  s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
173
  s3_whole_document_textract_output_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
174
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
175
- no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
176
- textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
177
 
178
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
179
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
@@ -233,7 +218,7 @@ with app:
233
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
234
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
235
 
236
- textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
237
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
238
 
239
  ###
@@ -256,15 +241,15 @@ with app:
256
  ###
257
  with gr.Tab("Redact PDFs/images"):
258
  with gr.Accordion("Redact document", open = True):
259
- in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
260
 
261
- text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
262
 
263
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
264
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
265
 
266
  with gr.Row(equal_height=True):
267
- pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = default_pii_detector, choices=[no_redaction_option, local_pii_detector, aws_pii_detector])
268
 
269
  if SHOW_COSTS == "True":
270
  with gr.Accordion("Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs)", open = True, visible=True):
@@ -311,7 +296,7 @@ with app:
311
 
312
  with gr.Row():
313
  redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
314
- output_file = gr.File(label="Output files", scale = 2)#, height=file_input_height)
315
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
316
 
317
  # Feedback elements are invisible until revealed by redaction action
@@ -326,7 +311,7 @@ with app:
326
  with gr.Tab("Review redactions", id="tab_object_annotation"):
327
 
328
  with gr.Accordion(label = "Review PDF redactions", open=True):
329
- output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions. The 'ocr_output' file can also be optionally provided for text search.", file_count='multiple', height=file_input_height)
330
  upload_previous_review_file_btn = gr.Button("Review redactions based on original PDF and 'review_file' csv provided above ('ocr_output' csv optional)", variant="secondary")
331
  with gr.Row():
332
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
@@ -376,20 +361,18 @@ with app:
376
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
377
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
378
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
 
379
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=(4,"fixed"), type="pandas", label="Click table row to select and go to page", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
380
 
381
  with gr.Row(equal_height=True):
382
  exclude_selected_btn = gr.Button(value="Exclude all redactions in table")
383
 
384
  with gr.Accordion("Selected redaction row", open=True):
385
- selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=True, headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True)
386
  exclude_selected_row_btn = gr.Button(value="Exclude specific redaction row")
387
- exclude_text_with_same_as_selected_row_btn = gr.Button(value="Exclude all redactions with same text as selected row")
388
-
389
- with gr.Row(equal_height=True):
390
- reset_dropdowns_btn = gr.Button(value="Reset filters")
391
 
392
- undo_last_removal_btn = gr.Button(value="Undo last element removal")
393
 
394
  with gr.Accordion("Search all extracted text", open=True):
395
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
@@ -405,12 +388,12 @@ with app:
405
  ###
406
  with gr.Tab(label="Identify duplicate pages"):
407
  with gr.Accordion("Identify duplicate pages to redact", open = True):
408
- in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
409
  with gr.Row():
410
  duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
411
  find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
412
 
413
- duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
414
 
415
  ###
416
  # TEXT / TABULAR DATA TAB
@@ -420,13 +403,13 @@ with app:
420
  with gr.Accordion("Redact open text", open = False):
421
  in_text = gr.Textbox(label="Enter open text", lines=10)
422
  with gr.Accordion("Upload xlsx or csv files", open = True):
423
- in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'], height=file_input_height)
424
 
425
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
426
 
427
  in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
428
 
429
- pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
430
 
431
  with gr.Accordion("Anonymisation output format", open = False):
432
  anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = "replace with 'REDACTED'") # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
@@ -452,13 +435,13 @@ with app:
452
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
453
  with gr.Row():
454
  with gr.Column():
455
- in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=file_input_height)
456
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
457
  with gr.Column():
458
- in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=file_input_height)
459
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
460
  with gr.Column():
461
- in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
462
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
463
  with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
464
  with gr.Row():
@@ -467,8 +450,8 @@ with app:
467
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
468
 
469
  with gr.Accordion("Select entity types to redact", open = True):
470
- in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
471
- in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
472
 
473
  with gr.Row():
474
  max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
@@ -767,7 +750,7 @@ with app:
767
 
768
  ### ACCESS LOGS
769
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
770
- access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
771
  access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
772
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
773
  success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
@@ -775,25 +758,25 @@ with app:
775
  ### FEEDBACK LOGS
776
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
777
  # User submitted feedback for pdf redactions
778
- pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
779
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
780
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
781
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
782
 
783
  # User submitted feedback for data redactions
784
- data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
785
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
786
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
787
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
788
  else:
789
  # User submitted feedback for pdf redactions
790
- pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
791
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
792
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
793
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
794
 
795
  # User submitted feedback for data redactions
796
- data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
797
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
798
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
799
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
@@ -801,7 +784,7 @@ with app:
801
  ### USAGE LOGS
802
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
803
 
804
- usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
805
 
806
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
807
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
@@ -839,7 +822,7 @@ if __name__ == "__main__":
839
 
840
  main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
841
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
842
- current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),chosen_comprehend_entities = chosen_comprehend_entities, chosen_redact_entities = chosen_redact_entities, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
843
 
844
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
845
  # with gr.Tab(label="Advanced options"):
 
1
  import os
 
2
  import pandas as pd
3
  import gradio as gr
4
  from gradio_image_annotation import image_annotator
5
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, ALLOW_LIST_PATH, S3_ALLOW_LIST_PATH, FEEDBACK_LOGS_FOLDER, ACCESS_LOGS_FOLDER, USAGE_LOGS_FOLDER, REDACTION_LANGUAGE, GET_COST_CODES, COST_CODES_PATH, S3_COST_CODES_PATH, ENFORCE_COST_CODES, DISPLAY_FILE_NAMES_IN_LOGS, SHOW_COSTS, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET, SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, SESSION_OUTPUT_FOLDER, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, HOST_NAME, DEFAULT_COST_CODE, OUTPUT_COST_CODES_PATH, OUTPUT_ALLOW_LIST_PATH, COGNITO_AUTH, SAVE_LOGS_TO_CSV, SAVE_LOGS_TO_DYNAMODB, ACCESS_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_ACCESS_LOG_HEADERS, CSV_ACCESS_LOG_HEADERS, FEEDBACK_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_FEEDBACK_LOG_HEADERS, CSV_FEEDBACK_LOG_HEADERS, USAGE_LOG_DYNAMODB_TABLE_NAME, DYNAMODB_USAGE_LOG_HEADERS, CSV_USAGE_LOG_HEADERS, TEXTRACT_JOBS_S3_INPUT_LOC, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, TEXT_EXTRACTION_MODELS, PII_DETECTION_MODELS, DEFAULT_TEXT_EXTRACTION_MODEL, DEFAULT_PII_DETECTION_MODEL, LOG_FILE_NAME, CHOSEN_COMPREHEND_ENTITIES, FULL_COMPREHEND_ENTITY_LIST, CHOSEN_REDACT_ENTITIES, FULL_ENTITY_LIST, FILE_INPUT_HEIGHT, TABULAR_PII_DETECTION_MODELS
6
+ from tools.helper_functions import put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, reset_review_vars, merge_csv_files, load_all_output_files, update_dataframe, check_for_existing_textract_file, load_in_default_cost_codes, enforce_cost_codes, calculate_aws_costs, calculate_time_taken, reset_base_dataframe, reset_ocr_base_dataframe, update_cost_code_dataframe_from_dropdown_select, check_for_existing_local_ocr_file, reset_data_vars, reset_aws_call_vars
7
+ from tools.aws_functions import download_file_from_s3, upload_log_file_to_s3
 
8
  from tools.file_redaction import choose_and_run_redactor
9
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
10
  from tools.redaction_review import apply_redactions_to_review_df_and_files, update_all_page_annotation_object_based_on_previous_page, decrease_page, increase_page, update_annotator_object_and_filter_df, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback_dataframe_row, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal, update_selected_review_df_row_colour, update_all_entity_df_dropdowns, df_select_callback_cost, update_other_annotator_number_from_current, update_annotator_page_from_review_df, df_select_callback_ocr, df_select_callback_textract_api, get_all_rows_with_same_text
 
18
  # Suppress downcasting warnings
19
  pd.set_option('future.no_silent_downcasting', True)
20
 
21
+ # Convert string environment variables to boolean or list types
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  SAVE_LOGS_TO_CSV = eval(SAVE_LOGS_TO_CSV)
23
  SAVE_LOGS_TO_DYNAMODB = eval(SAVE_LOGS_TO_DYNAMODB)
24
 
 
30
  if DYNAMODB_FEEDBACK_LOG_HEADERS: DYNAMODB_FEEDBACK_LOG_HEADERS = eval(DYNAMODB_FEEDBACK_LOG_HEADERS)
31
  if DYNAMODB_USAGE_LOG_HEADERS: DYNAMODB_USAGE_LOG_HEADERS = eval(DYNAMODB_USAGE_LOG_HEADERS)
32
 
33
+ if CHOSEN_COMPREHEND_ENTITIES: CHOSEN_COMPREHEND_ENTITIES = eval(CHOSEN_COMPREHEND_ENTITIES)
34
+ if FULL_COMPREHEND_ENTITY_LIST: FULL_COMPREHEND_ENTITY_LIST = eval(FULL_COMPREHEND_ENTITY_LIST)
35
+ if CHOSEN_REDACT_ENTITIES: CHOSEN_REDACT_ENTITIES = eval(CHOSEN_REDACT_ENTITIES)
36
+ if FULL_ENTITY_LIST: FULL_ENTITY_LIST = eval(FULL_ENTITY_LIST)
37
+
38
+ # Add custom spaCy recognisers to the Comprehend list, so that the local spaCy model can be used to pick up e.g. titles, street names, and UK postcodes that are sometimes missed by Comprehend
39
+ CHOSEN_COMPREHEND_ENTITIES.extend(custom_entities)
40
+ FULL_COMPREHEND_ENTITY_LIST.extend(custom_entities)
41
+
42
+ FILE_INPUT_HEIGHT = int(FILE_INPUT_HEIGHT)
43
+
44
  # Create the gradio interface
45
  app = gr.Blocks(theme = gr.themes.Base(), fill_width=True)
46
 
 
52
 
53
  # Pymupdf doc and all image annotations objects need to be stored as State objects as they do not have a standard Gradio component equivalent
54
  pdf_doc_state = gr.State([])
55
+ all_image_annotations_state = gr.State([])
 
56
 
57
  all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas", wrap=True)
58
  review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas", wrap=True)
 
90
  backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
91
 
92
  # Logging state
93
+ feedback_logs_state = gr.Textbox(label= "feedback_logs_state", value=FEEDBACK_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
94
  feedback_s3_logs_loc_state = gr.Textbox(label= "feedback_s3_logs_loc_state", value=FEEDBACK_LOGS_FOLDER, visible=False)
95
+ access_logs_state = gr.Textbox(label= "access_logs_state", value=ACCESS_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
96
  access_s3_logs_loc_state = gr.Textbox(label= "access_s3_logs_loc_state", value=ACCESS_LOGS_FOLDER, visible=False)
97
+ usage_logs_state = gr.Textbox(label= "usage_logs_state", value=USAGE_LOGS_FOLDER + LOG_FILE_NAME, visible=False)
98
  usage_s3_logs_loc_state = gr.Textbox(label= "usage_s3_logs_loc_state", value=USAGE_LOGS_FOLDER, visible=False)
99
 
100
  session_hash_textbox = gr.Textbox(label= "session_hash_textbox", value="", visible=False)
 
157
  s3_whole_document_textract_input_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, visible=False)
158
  s3_whole_document_textract_output_subfolder = gr.Textbox(label = "Default Textract whole_document S3 output folder", value=TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
159
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
160
+ no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = NO_REDACTION_PII_OPTION, choices=[NO_REDACTION_PII_OPTION], visible=False)
161
+ textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = TEXTRACT_TEXT_EXTRACT_OPTION, choices=[TEXTRACT_TEXT_EXTRACT_OPTION], visible=False)
162
 
163
  load_s3_whole_document_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
164
  s3_whole_document_textract_logs_subfolder = gr.Textbox(label = "Default Textract whole_document S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
 
218
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
219
  job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
220
 
221
+ textract_job_output_file = gr.File(label="Textract job output files", height=FILE_INPUT_HEIGHT, visible=False)
222
  convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
223
 
224
  ###
 
241
  ###
242
  with gr.Tab("Redact PDFs/images"):
243
  with gr.Accordion("Redact document", open = True):
244
+ in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=FILE_INPUT_HEIGHT)
245
 
246
+ text_extract_method_radio = gr.Radio(label="""Choose text extraction method. Local options are lower quality but cost nothing - they may be worth a try if you are willing to spend some time reviewing outputs. AWS Textract has a cost per page - £2.66 ($3.50) per 1,000 pages with signature detection (default), £1.14 ($1.50) without. Change the settings in the tab below (AWS Textract signature detection) to change this.""", value = DEFAULT_TEXT_EXTRACTION_MODEL, choices=TEXT_EXTRACTION_MODELS)
247
 
248
  with gr.Accordion("AWS Textract signature detection (default is on)", open = False):
249
  handwrite_signature_checkbox = gr.CheckboxGroup(label="AWS Textract extraction settings", choices=["Extract handwriting", "Extract signatures"], value=["Extract handwriting", "Extract signatures"])
250
 
251
  with gr.Row(equal_height=True):
252
+ pii_identification_method_drop = gr.Radio(label = """Choose personal information detection method. The local model is lower quality but costs nothing - it may be worth a try if you are willing to spend some time reviewing outputs, or if you are only interested in searching for custom search terms (see Redaction settings - custom deny list). AWS Comprehend has a cost of around £0.0075 ($0.01) per 10,000 characters.""", value = DEFAULT_PII_DETECTION_MODEL, choices=PII_DETECTION_MODELS)
253
 
254
  if SHOW_COSTS == "True":
255
  with gr.Accordion("Estimated costs and time taken. Note that costs shown only include direct usage of AWS services and do not include other running costs (e.g. storage, run-time costs)", open = True, visible=True):
 
296
 
297
  with gr.Row():
298
  redaction_output_summary_textbox = gr.Textbox(label="Output summary", scale=1)
299
+ output_file = gr.File(label="Output files", scale = 2)#, height=FILE_INPUT_HEIGHT)
300
  latest_file_completed_text = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
301
 
302
  # Feedback elements are invisible until revealed by redaction action
 
311
  with gr.Tab("Review redactions", id="tab_object_annotation"):
312
 
313
  with gr.Accordion(label = "Review PDF redactions", open=True):
314
+ output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions. The 'ocr_output' file can also be optionally provided for text search.", file_count='multiple', height=FILE_INPUT_HEIGHT)
315
  upload_previous_review_file_btn = gr.Button("Review redactions based on original PDF and 'review_file' csv provided above ('ocr_output' csv optional)", variant="secondary")
316
  with gr.Row():
317
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
 
361
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
362
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
363
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
364
+ reset_dropdowns_btn = gr.Button(value="Reset filters")
365
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=(4,"fixed"), type="pandas", label="Click table row to select and go to page", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
366
 
367
  with gr.Row(equal_height=True):
368
  exclude_selected_btn = gr.Button(value="Exclude all redactions in table")
369
 
370
  with gr.Accordion("Selected redaction row", open=True):
371
+ selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=True, headers=["page", "label", "text", "id"], wrap=True)
372
  exclude_selected_row_btn = gr.Button(value="Exclude specific redaction row")
373
+ exclude_text_with_same_as_selected_row_btn = gr.Button(value="Exclude all redactions with same text as selected row")
 
 
 
374
 
375
+ undo_last_removal_btn = gr.Button(value="Undo last element removal", variant="primary")
376
 
377
  with gr.Accordion("Search all extracted text", open=True):
378
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
 
388
  ###
389
  with gr.Tab(label="Identify duplicate pages"):
390
  with gr.Accordion("Identify duplicate pages to redact", open = True):
391
+ in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
392
  with gr.Row():
393
  duplicate_threshold_value = gr.Number(value=0.9, label="Minimum similarity to be considered a duplicate (maximum = 1)", scale =1)
394
  find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary", scale = 4)
395
 
396
+ duplicate_pages_out = gr.File(label="Duplicate pages analysis output", file_count="multiple", height=FILE_INPUT_HEIGHT, file_types=['.csv'])
397
 
398
  ###
399
  # TEXT / TABULAR DATA TAB
 
403
  with gr.Accordion("Redact open text", open = False):
404
  in_text = gr.Textbox(label="Enter open text", lines=10)
405
  with gr.Accordion("Upload xlsx or csv files", open = True):
406
+ in_data_files = gr.File(label="Choose Excel or csv files", file_count= "multiple", file_types=['.xlsx', '.xls', '.csv', '.parquet', '.csv.gz'], height=FILE_INPUT_HEIGHT)
407
 
408
  in_excel_sheets = gr.Dropdown(choices=["Choose Excel sheets to anonymise"], multiselect = True, label="Select Excel sheets that you want to anonymise (showing sheets present across all Excel files).", visible=False, allow_custom_value=True)
409
 
410
  in_colnames = gr.Dropdown(choices=["Choose columns to anonymise"], multiselect = True, label="Select columns that you want to anonymise (showing columns present across all files).")
411
 
412
+ pii_identification_method_drop_tabular = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = DEFAULT_PII_DETECTION_MODEL, choices=TABULAR_PII_DETECTION_MODELS)
413
 
414
  with gr.Accordion("Anonymisation output format", open = False):
415
  anon_strat = gr.Radio(choices=["replace with 'REDACTED'", "replace with <ENTITY_NAME>", "redact completely", "hash", "mask"], label="Select an anonymisation method.", value = "replace with 'REDACTED'") # , "encrypt", "fake_first_name" are also available, but are not currently included as not that useful in current form
 
435
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
436
  with gr.Row():
437
  with gr.Column():
438
+ in_allow_list = gr.File(label="Import allow list file - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will not be redacted.", file_count="multiple", height=FILE_INPUT_HEIGHT)
439
  in_allow_list_text = gr.Textbox(label="Custom allow list load status")
440
  with gr.Column():
441
+ in_deny_list = gr.File(label="Import custom deny list - csv table with one column of a different word/phrase on each row (case insensitive). Terms in this file will always be redacted.", file_count="multiple", height=FILE_INPUT_HEIGHT)
442
  in_deny_list_text = gr.Textbox(label="Custom deny list load status")
443
  with gr.Column():
444
+ in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=FILE_INPUT_HEIGHT)
445
  in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
446
  with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists (NOTE: you need to press Enter after modifying/adding an entry to the lists to apply them)", open = False):
447
  with gr.Row():
 
450
  in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=["fully_redacted_pages_list"], col_count=(1, "fixed"), row_count = (0, "dynamic"), label="Fully redacted pages", visible=True, type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, datatype='number', wrap=True)
451
 
452
  with gr.Accordion("Select entity types to redact", open = True):
453
+ in_redact_entities = gr.Dropdown(value=CHOSEN_REDACT_ENTITIES, choices=FULL_ENTITY_LIST, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
454
+ in_redact_comprehend_entities = gr.Dropdown(value=CHOSEN_COMPREHEND_ENTITIES, choices=FULL_COMPREHEND_ENTITY_LIST, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
455
 
456
  with gr.Row():
457
  max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
 
750
 
751
  ### ACCESS LOGS
752
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
753
+ access_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
754
  access_callback.setup([session_hash_textbox, host_name_textbox], ACCESS_LOGS_FOLDER)
755
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=ACCESS_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_ACCESS_LOG_HEADERS, replacement_headers=CSV_ACCESS_LOG_HEADERS), [session_hash_textbox, host_name_textbox], None, preprocess=False).\
756
  success(fn = upload_log_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 
758
  ### FEEDBACK LOGS
759
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
760
  # User submitted feedback for pdf redactions
761
+ pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
762
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
763
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
764
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
765
 
766
  # User submitted feedback for data redactions
767
+ data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
768
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
769
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
770
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
771
  else:
772
  # User submitted feedback for pdf redactions
773
+ pdf_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
774
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], FEEDBACK_LOGS_FOLDER)
775
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [pdf_feedback_radio, pdf_further_details_text, placeholder_doc_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
776
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
777
 
778
  # User submitted feedback for data redactions
779
+ data_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
780
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], FEEDBACK_LOGS_FOLDER)
781
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args), save_to_csv=SAVE_LOGS_TO_CSV, save_to_dynamodb=SAVE_LOGS_TO_DYNAMODB, dynamodb_table_name=FEEDBACK_LOG_DYNAMODB_TABLE_NAME, dynamodb_headers=DYNAMODB_FEEDBACK_LOG_HEADERS, replacement_headers=CSV_FEEDBACK_LOG_HEADERS), [data_feedback_radio, data_further_details_text, placeholder_data_file_name_no_extension_textbox_for_logs], None, preprocess=False).\
782
  success(fn = upload_log_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
 
784
  ### USAGE LOGS
785
  # Log processing usage - time taken for redaction queries, and also logs for queries to Textract/Comprehend
786
 
787
+ usage_callback = CSVLogger_custom(dataset_file_name=LOG_FILE_NAME)
788
 
789
  if DISPLAY_FILE_NAMES_IN_LOGS == 'True':
790
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, total_pdf_page_count, actual_time_taken_number, textract_query_number, pii_identification_method_drop, comprehend_query_number, cost_code_choice_drop, handwrite_signature_checkbox, host_name_textbox, text_extract_method_radio, is_a_textract_api_call], USAGE_LOGS_FOLDER)
 
822
 
823
  main(first_loop_state, latest_file_completed=0, redaction_output_summary_textbox="", output_file_list=None,
824
  log_files_list=None, estimated_time=0, textract_metadata="", comprehend_query_num=0,
825
+ current_loop_page=0, page_break=False, pdf_doc_state = [], all_image_annotations = [], all_line_level_ocr_results_df = pd.DataFrame(), all_decision_process_table = pd.DataFrame(),CHOSEN_COMPREHEND_ENTITIES = CHOSEN_COMPREHEND_ENTITIES, CHOSEN_REDACT_ENTITIES = CHOSEN_REDACT_ENTITIES, handwrite_signature_checkbox = ["Extract handwriting", "Extract signatures"])
826
 
827
  # AWS options - placeholder for possibility of storing data on s3 and retrieving it in app
828
  # with gr.Tab(label="Advanced options"):
tools/config.py CHANGED
@@ -204,7 +204,7 @@ if LOGGING == 'True':
204
  # Configure logging
205
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
206
 
207
-
208
 
209
 
210
  ###
@@ -218,6 +218,80 @@ POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on
218
  if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
219
  if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
220
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
223
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
@@ -232,9 +306,6 @@ RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION
232
 
233
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
234
 
235
-
236
-
237
-
238
  ###
239
  # APP RUN OPTIONS
240
  ###
@@ -269,7 +340,7 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
269
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
270
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
271
 
272
-
273
 
274
 
275
  ###
 
204
  # Configure logging
205
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
206
 
207
+ LOG_FILE_NAME = get_or_create_env_var('LOG_FILE_NAME', 'log.csv')
208
 
209
 
210
  ###
 
218
  if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
219
  if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
220
 
221
+ # List of models to use for text extraction and PII detection
222
+ # Text extraction models
223
+ SELECTABLE_TEXT_EXTRACT_OPTION = get_or_create_env_var('SELECTABLE_TEXT_EXTRACT_OPTION', "Local model - selectable text")
224
+ TESSERACT_TEXT_EXTRACT_OPTION = get_or_create_env_var('TESSERACT_TEXT_EXTRACT_OPTION', "Local OCR model - PDFs without selectable text")
225
+ TEXTRACT_TEXT_EXTRACT_OPTION = get_or_create_env_var('TEXTRACT_TEXT_EXTRACT_OPTION', "AWS Textract service - all PDF types")
226
+
227
+ # PII detection models
228
+ NO_REDACTION_PII_OPTION = get_or_create_env_var('NO_REDACTION_PII_OPTION', "Only extract text (no redaction)")
229
+ LOCAL_PII_OPTION = get_or_create_env_var('LOCAL_PII_OPTION', "Local")
230
+ AWS_PII_OPTION = get_or_create_env_var('AWS_PII_OPTION', "AWS Comprehend")
231
+
232
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS', 'True')
233
+ SHOW_AWS_TEXT_EXTRACTION_OPTIONS = get_or_create_env_var('SHOW_AWS_TEXT_EXTRACTION_OPTIONS', 'True')
234
+
235
+ # Fall back to showing the local options if all text extraction options have been mistakenly disabled
236
+ if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS != "True" and SHOW_AWS_TEXT_EXTRACTION_OPTIONS != "True":
237
+ SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS = "True"
238
+
239
+ local_model_options = []
240
+ aws_model_options = []
241
+ text_extraction_models = []
242
+
243
+ if SHOW_LOCAL_TEXT_EXTRACTION_OPTIONS == 'True':
244
+ local_model_options.append(SELECTABLE_TEXT_EXTRACT_OPTION)
245
+ local_model_options.append(TESSERACT_TEXT_EXTRACT_OPTION)
246
+
247
+ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == 'True':
248
+ aws_model_options.append(TEXTRACT_TEXT_EXTRACT_OPTION)
249
+
250
+ TEXT_EXTRACTION_MODELS = local_model_options + aws_model_options
251
+
252
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_LOCAL_PII_DETECTION_OPTIONS', 'True')
253
+ SHOW_AWS_PII_DETECTION_OPTIONS = get_or_create_env_var('SHOW_AWS_PII_DETECTION_OPTIONS', 'True')
254
+
255
+ if SHOW_LOCAL_PII_DETECTION_OPTIONS != "True" and SHOW_AWS_PII_DETECTION_OPTIONS != "True":
256
+ SHOW_LOCAL_PII_DETECTION_OPTIONS = "True"
257
+
258
+ local_model_options = [NO_REDACTION_PII_OPTION]
259
+ aws_model_options = []
260
+ pii_detection_models = []
261
+
262
+ if SHOW_LOCAL_PII_DETECTION_OPTIONS == 'True':
263
+ local_model_options.append(LOCAL_PII_OPTION)
264
+
265
+ if SHOW_AWS_PII_DETECTION_OPTIONS == 'True':
266
+ aws_model_options.append(AWS_PII_OPTION)
267
+
268
+ PII_DETECTION_MODELS = local_model_options + aws_model_options
269
+
270
+ if SHOW_AWS_TEXT_EXTRACTION_OPTIONS == "True":
271
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var('DEFAULT_TEXT_EXTRACTION_MODEL', TEXTRACT_TEXT_EXTRACT_OPTION)
272
+ else:
273
+ DEFAULT_TEXT_EXTRACTION_MODEL = get_or_create_env_var('DEFAULT_TEXT_EXTRACTION_MODEL', SELECTABLE_TEXT_EXTRACT_OPTION)
274
+
275
+ if SHOW_AWS_PII_DETECTION_OPTIONS == "True":
276
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var('DEFAULT_PII_DETECTION_MODEL', AWS_PII_OPTION)
277
+ else:
278
+ DEFAULT_PII_DETECTION_MODEL = get_or_create_env_var('DEFAULT_PII_DETECTION_MODEL', LOCAL_PII_OPTION)
279
+
280
+ # Create list of PII detection models for tabular redaction
281
+ TABULAR_PII_DETECTION_MODELS = PII_DETECTION_MODELS.copy()
282
+ if NO_REDACTION_PII_OPTION in TABULAR_PII_DETECTION_MODELS:
283
+ TABULAR_PII_DETECTION_MODELS.remove(NO_REDACTION_PII_OPTION)
284
+
285
+ # Entities for redaction
286
+ CHOSEN_COMPREHEND_ENTITIES = get_or_create_env_var('CHOSEN_COMPREHEND_ENTITIES', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']")
287
+
288
+ FULL_COMPREHEND_ENTITY_LIST = get_or_create_env_var('FULL_COMPREHEND_ENTITY_LIST', "['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', 'CUSTOM_FUZZY']")
289
+
290
+ # Entities for local PII redaction option
291
+ CHOSEN_REDACT_ENTITIES = get_or_create_env_var('CHOSEN_REDACT_ENTITIES', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CUSTOM']")
292
+
293
+ FULL_ENTITY_LIST = get_or_create_env_var('FULL_ENTITY_LIST', "['TITLES', 'PERSON', 'PHONE_NUMBER', 'EMAIL_ADDRESS', 'STREETNAME', 'UKPOSTCODE', 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']")
294
+
295
 
296
  # Number of pages to loop through before breaking the function and restarting from the last finished page (not currently activated).
297
  PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
 
306
 
307
  COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
308
 
 
 
 
309
  ###
310
  # APP RUN OPTIONS
311
  ###
 
340
  if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
341
  else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'
342
 
343
+ FILE_INPUT_HEIGHT = get_or_create_env_var('FILE_INPUT_HEIGHT', '200')
344
 
345
 
346
  ###
tools/data_anonymise.py CHANGED
@@ -6,20 +6,16 @@ import time
6
  import boto3
7
  import botocore
8
  import pandas as pd
9
- from openpyxl import Workbook, load_workbook
10
-
11
  from faker import Faker
12
  from gradio import Progress
13
  from typing import List, Dict, Any
14
-
15
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
16
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
17
- from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
18
-
19
  from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
20
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
21
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
22
- from tools.custom_image_analyser_engine import do_aws_comprehend_call
23
  # Use custom version of analyze_dict to be able to track progress
24
  from tools.presidio_analyzer_custom import analyze_dict
25
 
@@ -28,7 +24,7 @@ fake = Faker("en_UK")
28
  def fake_first_name(x):
29
  return fake.first_name()
30
 
31
- def initial_clean(text):
32
  #### Some of my cleaning functions
33
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
34
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
@@ -49,7 +45,7 @@ def initial_clean(text):
49
 
50
  return text
51
 
52
- def process_recognizer_result(result, recognizer_result, data_row, dictionary_key, df_dict, keys_to_keep):
53
  output = []
54
 
55
  if hasattr(result, 'value'):
@@ -115,7 +111,7 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
115
 
116
  return decision_process_output_str
117
 
118
- def anon_consistent_names(df):
119
  # ## Pick out common names and replace them with the same person value
120
  df_dict = df.to_dict(orient="list")
121
 
@@ -553,7 +549,19 @@ def anon_wrapper_func(
553
 
554
  return out_file_paths, out_message, key_string, log_files_output_paths
555
 
556
- def anonymise_script(df:pd.DataFrame, anon_strat:str, language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=[], in_deny_list:List[str]=[], max_fuzzy_spelling_mistakes_num:int=0, pii_identification_method:str="Local", chosen_redact_comprehend_entities:List[str]=[], comprehend_query_number:int=0, comprehend_client:botocore.client.BaseClient="", custom_entities=custom_entities, progress=Progress(track_tqdm=False)):
 
 
 
 
 
 
 
 
 
 
 
 
557
  '''
558
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
559
  '''
 
6
  import boto3
7
  import botocore
8
  import pandas as pd
9
+ from openpyxl import Workbook
 
10
  from faker import Faker
11
  from gradio import Progress
12
  from typing import List, Dict, Any
13
+ from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
15
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 
 
16
  from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, OUTPUT_FOLDER
17
  from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
18
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
 
19
  # Use custom version of analyze_dict to be able to track progress
20
  from tools.presidio_analyzer_custom import analyze_dict
21
 
 
24
  def fake_first_name(x):
25
  return fake.first_name()
26
 
27
+ def initial_clean(text:str) -> str:
28
  #### Some of my cleaning functions
29
  html_pattern_regex = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});|\xa0|&nbsp;'
30
  html_start_pattern_end_dots_regex = r'<(.*?)\.\.'
 
45
 
46
  return text
47
 
48
+ def process_recognizer_result(result:RecognizerResult, recognizer_result:RecognizerResult, data_row:int, dictionary_key:int, df_dict:Dict[str, List[Any]], keys_to_keep:List[str]) -> List[str]:
49
  output = []
50
 
51
  if hasattr(result, 'value'):
 
111
 
112
  return decision_process_output_str
113
 
114
+ def anon_consistent_names(df:pd.DataFrame) -> pd.DataFrame:
115
  # ## Pick out common names and replace them with the same person value
116
  df_dict = df.to_dict(orient="list")
117
 
 
549
 
550
  return out_file_paths, out_message, key_string, log_files_output_paths
551
 
552
+ def anonymise_script(df:pd.DataFrame,
553
+ anon_strat:str,
554
+ language:str,
555
+ chosen_redact_entities:List[str],
556
+ in_allow_list:List[str]=[],
557
+ in_deny_list:List[str]=[],
558
+ max_fuzzy_spelling_mistakes_num:int=0,
559
+ pii_identification_method:str="Local",
560
+ chosen_redact_comprehend_entities:List[str]=[],
561
+ comprehend_query_number:int=0,
562
+ comprehend_client:botocore.client.BaseClient="",
563
+ custom_entities:List[str]=custom_entities,
564
+ progress:Progress=Progress(track_tqdm=False)):
565
  '''
566
  Conduct anonymisation of a dataframe using Presidio and/or AWS Comprehend if chosen.
567
  '''
tools/file_conversion.py CHANGED
@@ -1,5 +1,4 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
-
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
@@ -14,7 +13,7 @@ import zipfile
14
  from collections import defaultdict
15
  from tqdm import tqdm
16
  from gradio import Progress
17
- from typing import List, Optional, Dict, Any
18
  from concurrent.futures import ThreadPoolExecutor, as_completed
19
  from pdf2image import convert_from_path
20
  from PIL import Image
@@ -23,14 +22,14 @@ import random
23
  import string
24
  import warnings # To warn about potential type changes
25
 
 
 
 
 
26
  IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
27
 
28
  pd.set_option('future.no_silent_downcasting', True)
29
 
30
- from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF
31
- from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
32
- # from tools.aws_textract import load_and_convert_textract_json
33
-
34
  image_dpi = float(IMAGES_DPI)
35
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
36
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
@@ -596,8 +595,8 @@ def prepare_image_or_pdf(
596
 
597
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
598
  # Check if the file is an image type and the user selected text ocr option
599
- if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
600
- in_redact_method = tesseract_ocr_option
601
 
602
  # Convert image to a pymupdf document
603
  pymupdf_doc = pymupdf.open() # Create a new empty document
@@ -765,13 +764,13 @@ def prepare_image_or_pdf(
765
 
766
  # Must be something else, return with error message
767
  else:
768
- if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
769
  if is_pdf_or_image(file_path) == False:
770
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
771
  print(out_message)
772
  raise Exception(out_message)
773
 
774
- elif in_redact_method == text_ocr_option:
775
  if is_pdf(file_path) == False:
776
  out_message = "Please upload a PDF file for text analysis."
777
  print(out_message)
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
 
2
  from PIL import Image, ImageFile
3
  import os
4
  import re
 
13
  from collections import defaultdict
14
  from tqdm import tqdm
15
  from gradio import Progress
16
+ from typing import List, Dict, Any
17
  from concurrent.futures import ThreadPoolExecutor, as_completed
18
  from pdf2image import convert_from_path
19
  from PIL import Image
 
22
  import string
23
  import warnings # To warn about potential type changes
24
 
25
+ from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION
26
+ from tools.helper_functions import get_file_name_without_type, read_file
27
+ # from tools.aws_textract import load_and_convert_textract_json
28
+
29
  IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
30
 
31
  pd.set_option('future.no_silent_downcasting', True)
32
 
 
 
 
 
33
  image_dpi = float(IMAGES_DPI)
34
  if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
35
  else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
 
595
 
596
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
597
  # Check if the file is an image type and the user selected text ocr option
598
+ if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == SELECTABLE_TEXT_EXTRACT_OPTION:
599
+ in_redact_method = TESSERACT_TEXT_EXTRACT_OPTION
600
 
601
  # Convert image to a pymupdf document
602
  pymupdf_doc = pymupdf.open() # Create a new empty document
 
764
 
765
  # Must be something else, return with error message
766
  else:
767
+ if in_redact_method == TESSERACT_TEXT_EXTRACT_OPTION or in_redact_method == TEXTRACT_TEXT_EXTRACT_OPTION:
768
  if is_pdf_or_image(file_path) == False:
769
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
770
  print(out_message)
771
  raise Exception(out_message)
772
 
773
+ elif in_redact_method == SELECTABLE_TEXT_EXTRACT_OPTION:
774
  if is_pdf(file_path) == False:
775
  out_message = "Please upload a PDF file for text analysis."
776
  print(out_message)
tools/file_redaction.py CHANGED
@@ -19,11 +19,11 @@ import gradio as gr
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
- from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION
23
- from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes, recreate_page_line_level_ocr_results_with_page
24
- from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
- from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
28
 
29
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
@@ -242,7 +242,7 @@ def choose_and_run_redactor(file_paths:List[str],
242
  combined_out_message = combined_out_message + end_message
243
 
244
  # Only send across review file if redaction has been done
245
- if pii_identification_method != no_redaction_option:
246
 
247
  if len(review_out_file_paths) == 1:
248
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
@@ -262,12 +262,12 @@ def choose_and_run_redactor(file_paths:List[str],
262
  # Prepare documents and images as required if they don't already exist
263
  prepare_images_flag = None # Determines whether to call prepare_image_or_pdf
264
 
265
- if textract_output_found and text_extraction_method == textract_option:
266
  print("Existing Textract outputs found, not preparing images or documents.")
267
  prepare_images_flag = False
268
  #return # No need to call `prepare_image_or_pdf`, exit early
269
 
270
- elif text_extraction_method == text_ocr_option:
271
  print("Running text extraction analysis, not preparing images.")
272
  prepare_images_flag = False
273
 
@@ -316,7 +316,7 @@ def choose_and_run_redactor(file_paths:List[str],
316
  combined_out_message = combined_out_message + "\n" + out_message
317
 
318
  # Only send across review file if redaction has been done
319
- if pii_identification_method != no_redaction_option:
320
  # If only pdf currently in review outputs, add on the latest review file
321
  if len(review_out_file_paths) == 1:
322
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
@@ -361,7 +361,7 @@ def choose_and_run_redactor(file_paths:List[str],
361
 
362
 
363
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
364
- if pii_identification_method == aws_pii_detector:
365
  if aws_access_key_textbox and aws_secret_key_textbox:
366
  print("Connecting to Comprehend using AWS access key and secret keys from user input.")
367
  comprehend_client = boto3.client('comprehend',
@@ -384,7 +384,7 @@ def choose_and_run_redactor(file_paths:List[str],
384
  comprehend_client = ""
385
 
386
  # Try to connect to AWS Textract Client if using that text extraction method
387
- if text_extraction_method == textract_option:
388
  if aws_access_key_textbox and aws_secret_key_textbox:
389
  print("Connecting to Textract using AWS access key and secret keys from user input.")
390
  textract_client = boto3.client('textract',
@@ -429,10 +429,10 @@ def choose_and_run_redactor(file_paths:List[str],
429
  pdf_file_name_with_ext = os.path.basename(file_path)
430
 
431
  is_a_pdf = is_pdf(file_path) == True
432
- if is_a_pdf == False and text_extraction_method == text_ocr_option:
433
  # If user has not submitted a pdf, assume it's an image
434
  print("File is not a PDF, assuming that image analysis needs to be used.")
435
- text_extraction_method = tesseract_ocr_option
436
  else:
437
  out_message = "No file selected"
438
  print(out_message)
@@ -443,7 +443,7 @@ def choose_and_run_redactor(file_paths:List[str],
443
  review_file_path = orig_pdf_file_path + '_review_file.csv'
444
 
445
  # Remove any existing review_file paths from the review file outputs
446
- if text_extraction_method == tesseract_ocr_option or text_extraction_method == textract_option:
447
 
448
  #Analyse and redact image-based pdf or image
449
  if is_pdf_or_image(file_path) == False:
@@ -490,7 +490,7 @@ def choose_and_run_redactor(file_paths:List[str],
490
  all_textract_request_metadata.extend(new_textract_request_metadata)
491
 
492
 
493
- elif text_extraction_method == text_ocr_option:
494
 
495
  if is_pdf(file_path) == False:
496
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
@@ -541,7 +541,7 @@ def choose_and_run_redactor(file_paths:List[str],
541
 
542
 
543
  # Save redacted file
544
- if pii_identification_method != no_redaction_option:
545
  if RETURN_PDF_END_OF_REDACTION == True:
546
  progress(0.9, "Saving redacted file")
547
 
@@ -589,7 +589,7 @@ def choose_and_run_redactor(file_paths:List[str],
589
 
590
  review_file_state.to_csv(review_file_path, index=None)
591
 
592
- if pii_identification_method != no_redaction_option:
593
  out_file_paths.append(review_file_path)
594
 
595
  # Make a combined message for the file
@@ -1249,7 +1249,7 @@ def redact_image_pdf(file_path:str,
1249
  allow_list:List[str]=None,
1250
  page_min:int=0,
1251
  page_max:int=999,
1252
- text_extraction_method:str=tesseract_ocr_option,
1253
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
1254
  textract_request_metadata:list=[],
1255
  current_loop_page:int=0,
@@ -1287,7 +1287,7 @@ def redact_image_pdf(file_path:str,
1287
  - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
1288
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
1289
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
1290
- - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to tesseract_ocr_option.
1291
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1292
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
1293
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
@@ -1336,7 +1336,7 @@ def redact_image_pdf(file_path:str,
1336
  print(out_message)
1337
  raise Exception(out_message)
1338
 
1339
- if text_extraction_method == textract_option and textract_client == "":
1340
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
1341
  print(out_message_warning)
1342
  #raise Exception(out_message)
@@ -1353,7 +1353,7 @@ def redact_image_pdf(file_path:str,
1353
  print("Page range:", str(page_min + 1), "to", str(page_max))
1354
 
1355
  # If running Textract, check if file already exists. If it does, load in existing data
1356
- if text_extraction_method == textract_option:
1357
  textract_json_file_path = output_folder + file_name + "_textract.json"
1358
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1359
  original_textract_data = textract_data.copy()
@@ -1361,7 +1361,7 @@ def redact_image_pdf(file_path:str,
1361
  print("Successfully loaded in Textract analysis results from file")
1362
 
1363
  # If running local OCR option, check if file already exists. If it does, load in existing data
1364
- if text_extraction_method == tesseract_ocr_option:
1365
  all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
1366
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1367
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
@@ -1428,7 +1428,7 @@ def redact_image_pdf(file_path:str,
1428
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
1429
 
1430
  # If using Tesseract
1431
- if text_extraction_method == tesseract_ocr_option:
1432
 
1433
  if all_page_line_level_ocr_results_with_words:
1434
  # Find the first dict where 'page' matches
@@ -1452,7 +1452,7 @@ def redact_image_pdf(file_path:str,
1452
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
1453
 
1454
  # Check if page exists in existing textract data. If not, send to service to analyse
1455
- if text_extraction_method == textract_option:
1456
  text_blocks = []
1457
 
1458
  if not textract_data:
@@ -1527,7 +1527,7 @@ def redact_image_pdf(file_path:str,
1527
 
1528
  all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
1529
 
1530
- if pii_identification_method != no_redaction_option:
1531
  # Step 2: Analyse text and identify PII
1532
  if chosen_redact_entities or chosen_redact_comprehend_entities:
1533
 
@@ -1667,7 +1667,7 @@ def redact_image_pdf(file_path:str,
1667
  annotations_all_pages.append(page_image_annotations)
1668
 
1669
 
1670
- if text_extraction_method == textract_option:
1671
  if original_textract_data != textract_data:
1672
  # Write the updated existing textract data back to the JSON file
1673
  with open(textract_json_file_path, 'w') as json_file:
@@ -1676,7 +1676,7 @@ def redact_image_pdf(file_path:str,
1676
  if textract_json_file_path not in log_files_output_paths:
1677
  log_files_output_paths.append(textract_json_file_path)
1678
 
1679
- if text_extraction_method == tesseract_ocr_option:
1680
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1681
  # Write the updated existing textract data back to the JSON file
1682
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
@@ -1715,7 +1715,7 @@ def redact_image_pdf(file_path:str,
1715
  progress.close(_tqdm=progress_bar)
1716
  tqdm._instances.clear()
1717
 
1718
- if text_extraction_method == textract_option:
1719
  # Write the updated existing textract data back to the JSON file
1720
  if original_textract_data != textract_data:
1721
  with open(textract_json_file_path, 'w') as json_file:
@@ -1724,7 +1724,7 @@ def redact_image_pdf(file_path:str,
1724
  if textract_json_file_path not in log_files_output_paths:
1725
  log_files_output_paths.append(textract_json_file_path)
1726
 
1727
- if text_extraction_method == tesseract_ocr_option:
1728
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1729
  # Write the updated existing textract data back to the JSON file
1730
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
@@ -1739,7 +1739,7 @@ def redact_image_pdf(file_path:str,
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
1742
- if text_extraction_method == textract_option:
1743
  # Write the updated existing textract data back to the JSON file
1744
 
1745
  if original_textract_data != textract_data:
@@ -1749,7 +1749,7 @@ def redact_image_pdf(file_path:str,
1749
  if textract_json_file_path not in log_files_output_paths:
1750
  log_files_output_paths.append(textract_json_file_path)
1751
 
1752
- if text_extraction_method == tesseract_ocr_option:
1753
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1754
  # Write the updated existing textract data back to the JSON file
1755
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
@@ -2095,7 +2095,7 @@ def redact_text_pdf(
2095
  all_page_line_text_extraction_characters.extend(line_characters)
2096
 
2097
  ### REDACTION
2098
- if pii_identification_method != no_redaction_option:
2099
 
2100
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2101
  page_redaction_bounding_boxes = run_page_text_redaction(
 
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
+ from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION, TESSERACT_TEXT_EXTRACT_OPTION, SELECTABLE_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, LOCAL_PII_OPTION, AWS_PII_OPTION, NO_REDACTION_PII_OPTION
23
+ from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, recreate_page_line_level_ocr_results_with_page
24
+ from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
+ from tools.helper_functions import get_file_name_without_type, clean_unicode_text
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
28
 
29
  ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
 
242
  combined_out_message = combined_out_message + end_message
243
 
244
  # Only send across review file if redaction has been done
245
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
246
 
247
  if len(review_out_file_paths) == 1:
248
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
 
262
  # Prepare documents and images as required if they don't already exist
263
  prepare_images_flag = None # Determines whether to call prepare_image_or_pdf
264
 
265
+ if textract_output_found and text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
266
  print("Existing Textract outputs found, not preparing images or documents.")
267
  prepare_images_flag = False
268
  #return # No need to call `prepare_image_or_pdf`, exit early
269
 
270
+ elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
271
  print("Running text extraction analysis, not preparing images.")
272
  prepare_images_flag = False
273
 
 
316
  combined_out_message = combined_out_message + "\n" + out_message
317
 
318
  # Only send across review file if redaction has been done
319
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
320
  # If only pdf currently in review outputs, add on the latest review file
321
  if len(review_out_file_paths) == 1:
322
  #review_file_path = [x for x in out_file_paths if "review_file" in x]
 
361
 
362
 
363
  # Try to connect to AWS services directly only if RUN_AWS_FUNCTIONS environmental variable is 1, otherwise an environment variable or direct textbox input is needed.
364
+ if pii_identification_method == AWS_PII_OPTION:
365
  if aws_access_key_textbox and aws_secret_key_textbox:
366
  print("Connecting to Comprehend using AWS access key and secret keys from user input.")
367
  comprehend_client = boto3.client('comprehend',
 
384
  comprehend_client = ""
385
 
386
  # Try to connect to AWS Textract Client if using that text extraction method
387
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
388
  if aws_access_key_textbox and aws_secret_key_textbox:
389
  print("Connecting to Textract using AWS access key and secret keys from user input.")
390
  textract_client = boto3.client('textract',
 
429
  pdf_file_name_with_ext = os.path.basename(file_path)
430
 
431
  is_a_pdf = is_pdf(file_path) == True
432
+ if is_a_pdf == False and text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
433
  # If user has not submitted a pdf, assume it's an image
434
  print("File is not a PDF, assuming that image analysis needs to be used.")
435
+ text_extraction_method = TESSERACT_TEXT_EXTRACT_OPTION
436
  else:
437
  out_message = "No file selected"
438
  print(out_message)
 
443
  review_file_path = orig_pdf_file_path + '_review_file.csv'
444
 
445
  # Remove any existing review_file paths from the review file outputs
446
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
447
 
448
  #Analyse and redact image-based pdf or image
449
  if is_pdf_or_image(file_path) == False:
 
490
  all_textract_request_metadata.extend(new_textract_request_metadata)
491
 
492
 
493
+ elif text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION:
494
 
495
  if is_pdf(file_path) == False:
496
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
 
541
 
542
 
543
  # Save redacted file
544
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
545
  if RETURN_PDF_END_OF_REDACTION == True:
546
  progress(0.9, "Saving redacted file")
547
 
 
589
 
590
  review_file_state.to_csv(review_file_path, index=None)
591
 
592
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
593
  out_file_paths.append(review_file_path)
594
 
595
  # Make a combined message for the file
 
1249
  allow_list:List[str]=None,
1250
  page_min:int=0,
1251
  page_max:int=999,
1252
+ text_extraction_method:str=TESSERACT_TEXT_EXTRACT_OPTION,
1253
  handwrite_signature_checkbox:List[str]=["Extract handwriting", "Extract signatures"],
1254
  textract_request_metadata:list=[],
1255
  current_loop_page:int=0,
 
1287
  - allow_list (List[str], optional): A list of entity types to allow in the PDF. Defaults to None.
1288
  - page_min (int, optional): The minimum page number to start redaction from. Defaults to 0.
1289
  - page_max (int, optional): The maximum page number to end redaction at. Defaults to 999.
1290
+ - text_extraction_method (str, optional): The type of analysis to perform on the PDF. Defaults to TESSERACT_TEXT_EXTRACT_OPTION.
1291
  - handwrite_signature_checkbox (List[str], optional): A list of options for redacting handwriting and signatures. Defaults to ["Extract handwriting", "Extract signatures"].
1292
  - textract_request_metadata (list, optional): Metadata related to the redaction request. Defaults to an empty string.
1293
  - page_break_return (bool, optional): Indicates if the function should return after a page break. Defaults to False.
 
1336
  print(out_message)
1337
  raise Exception(out_message)
1338
 
1339
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION and textract_client == "":
1340
  out_message_warning = "Connection to AWS Textract service unsuccessful. Redaction will only continue if local AWS Textract results can be found."
1341
  print(out_message_warning)
1342
  #raise Exception(out_message)
 
1353
  print("Page range:", str(page_min + 1), "to", str(page_max))
1354
 
1355
  # If running Textract, check if file already exists. If it does, load in existing data
1356
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1357
  textract_json_file_path = output_folder + file_name + "_textract.json"
1358
  textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths, page_sizes_df)
1359
  original_textract_data = textract_data.copy()
 
1361
  print("Successfully loaded in Textract analysis results from file")
1362
 
1363
  # If running local OCR option, check if file already exists. If it does, load in existing data
1364
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1365
  all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
1366
  all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
1367
  original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
 
1428
  # Step 1: Perform OCR. Either with Tesseract, or with AWS Textract
1429
 
1430
  # If using Tesseract
1431
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1432
 
1433
  if all_page_line_level_ocr_results_with_words:
1434
  # Find the first dict where 'page' matches
 
1452
  all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
1453
 
1454
  # Check if page exists in existing textract data. If not, send to service to analyse
1455
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1456
  text_blocks = []
1457
 
1458
  if not textract_data:
 
1527
 
1528
  all_line_level_ocr_results_df_list.append(line_level_ocr_results_df)
1529
 
1530
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
1531
  # Step 2: Analyse text and identify PII
1532
  if chosen_redact_entities or chosen_redact_comprehend_entities:
1533
 
 
1667
  annotations_all_pages.append(page_image_annotations)
1668
 
1669
 
1670
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1671
  if original_textract_data != textract_data:
1672
  # Write the updated existing textract data back to the JSON file
1673
  with open(textract_json_file_path, 'w') as json_file:
 
1676
  if textract_json_file_path not in log_files_output_paths:
1677
  log_files_output_paths.append(textract_json_file_path)
1678
 
1679
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1680
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1681
  # Write the updated existing textract data back to the JSON file
1682
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
 
1715
  progress.close(_tqdm=progress_bar)
1716
  tqdm._instances.clear()
1717
 
1718
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1719
  # Write the updated existing textract data back to the JSON file
1720
  if original_textract_data != textract_data:
1721
  with open(textract_json_file_path, 'w') as json_file:
 
1724
  if textract_json_file_path not in log_files_output_paths:
1725
  log_files_output_paths.append(textract_json_file_path)
1726
 
1727
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1728
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1729
  # Write the updated existing textract data back to the JSON file
1730
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
 
1739
 
1740
  return pymupdf_doc, all_pages_decision_process_table, log_files_output_paths, textract_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
1741
 
1742
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
1743
  # Write the updated existing textract data back to the JSON file
1744
 
1745
  if original_textract_data != textract_data:
 
1749
  if textract_json_file_path not in log_files_output_paths:
1750
  log_files_output_paths.append(textract_json_file_path)
1751
 
1752
+ if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1753
  if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1754
  # Write the updated existing textract data back to the JSON file
1755
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
 
2095
  all_page_line_text_extraction_characters.extend(line_characters)
2096
 
2097
  ### REDACTION
2098
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
2099
 
2100
  if chosen_redact_entities or chosen_redact_comprehend_entities:
2101
  page_redaction_bounding_boxes = run_page_text_redaction(
tools/helper_functions.py CHANGED
@@ -9,16 +9,7 @@ import unicodedata
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
- from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC
13
-
14
- # Names for options labels
15
- text_ocr_option = "Local model - selectable text"
16
- tesseract_ocr_option = "Local OCR model - PDFs without selectable text"
17
- textract_option = "AWS Textract service - all PDF types"
18
-
19
- no_redaction_option = "Only extract text (no redaction)"
20
- local_pii_detector = "Local"
21
- aws_pii_detector = "AWS Comprehend"
22
 
23
  def reset_state_vars():
24
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
@@ -438,9 +429,9 @@ def calculate_aws_costs(number_of_pages:str,
438
  comprehend_unit_cost:float=0.0001,
439
  comprehend_size_unit_average:float=250,
440
  average_characters_per_page:float=2000,
441
- textract_option:str=textract_option,
442
- no_redaction_option:str=no_redaction_option,
443
- aws_pii_detector:str=aws_pii_detector):
444
  '''
445
  Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
446
 
@@ -457,9 +448,9 @@ def calculate_aws_costs(number_of_pages:str,
457
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
458
  - comprehend_size_unit_average (float, optional): Average size of a 'unit' of text passed to AWS Comprehend by the app through the batching process
459
  - average_characters_per_page (float, optional): Average number of characters on an A4 page.
460
- - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
461
- - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
462
- - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
463
  '''
464
  text_extraction_cost = 0
465
  pii_identification_cost = 0
@@ -467,14 +458,14 @@ def calculate_aws_costs(number_of_pages:str,
467
  number_of_pages = int(number_of_pages)
468
 
469
  if textract_output_found_checkbox != True:
470
- if text_extract_method_radio == textract_option:
471
  text_extraction_cost = number_of_pages * textract_page_cost
472
 
473
  if "Extract signatures" in handwrite_signature_checkbox:
474
  text_extraction_cost += (textract_signature_cost * number_of_pages)
475
 
476
- if pii_identification_method != no_redaction_option:
477
- if pii_identification_method == aws_pii_detector:
478
  comprehend_page_cost = ceil(average_characters_per_page / comprehend_size_unit_average) * comprehend_unit_cost
479
  pii_identification_cost = comprehend_page_cost * number_of_pages
480
 
@@ -497,11 +488,11 @@ def calculate_time_taken(number_of_pages:str,
497
  local_text_extraction_page_time:float=0.3,
498
  local_pii_redaction_page_time:float=0.5,
499
  local_ocr_extraction_page_time:float=1.5,
500
- textract_option:str=textract_option,
501
- text_ocr_option:str=text_ocr_option,
502
- local_ocr_option:str=tesseract_ocr_option,
503
- no_redaction_option:str=no_redaction_option,
504
- aws_pii_detector:str=aws_pii_detector):
505
  '''
506
  Calculate the approximate time to redact a document.
507
 
@@ -516,11 +507,11 @@ def calculate_time_taken(number_of_pages:str,
516
  - local_text_redaction_page_time (float, optional): Approximate time to extract text on a page with the local text redaction option.
517
  - local_pii_redaction_page_time (float, optional): Approximate time to redact text on a page with the local text redaction option.
518
  - local_ocr_extraction_page_time (float, optional): Approximate time to extract text from a page with the local OCR redaction option.
519
- - textract_option (str, optional): String label for the text_extract_method_radio button for AWS Textract.
520
- - text_ocr_option (str, optional): String label for text_extract_method_radio for text extraction.
521
  - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
522
- - no_redaction_option (str, optional): String label for pii_identification_method_drop for no redaction.
523
- - aws_pii_detector (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
524
  '''
525
  calculated_time_taken = 0
526
  page_conversion_time_taken = 0
@@ -530,22 +521,22 @@ def calculate_time_taken(number_of_pages:str,
530
  number_of_pages = int(number_of_pages)
531
 
532
  # Page preparation/conversion to image time
533
- if (text_extract_method_radio != text_ocr_option) and (textract_output_found_checkbox != True):
534
  page_conversion_time_taken = number_of_pages * convert_page_time
535
 
536
  # Page text extraction time
537
- if text_extract_method_radio == textract_option:
538
  if textract_output_found_checkbox != True:
539
  page_extraction_time_taken = number_of_pages * textract_page_time
540
  elif text_extract_method_radio == local_ocr_option:
541
  if local_ocr_output_found_checkbox != True:
542
  page_extraction_time_taken = number_of_pages * local_ocr_extraction_page_time
543
- elif text_extract_method_radio == text_ocr_option:
544
  page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
545
 
546
  # Page redaction time
547
- if pii_identification_method != no_redaction_option:
548
- if pii_identification_method == aws_pii_detector:
549
  page_redaction_time_taken = number_of_pages * comprehend_page_time
550
  else:
551
  page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
 
9
  from typing import List
10
  from math import ceil
11
  from gradio_image_annotation import image_annotator
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, OUTPUT_FOLDER, INPUT_FOLDER, SESSION_OUTPUT_FOLDER, AWS_USER_POOL_ID, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_INPUT_SUBFOLDER, TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_OUTPUT_SUBFOLDER, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, SELECTABLE_TEXT_EXTRACT_OPTION, TESSERACT_TEXT_EXTRACT_OPTION, TEXTRACT_TEXT_EXTRACT_OPTION, NO_REDACTION_PII_OPTION, AWS_PII_OPTION
 
 
 
 
 
 
 
 
 
13
 
14
  def reset_state_vars():
15
  return [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
 
429
  comprehend_unit_cost:float=0.0001,
430
  comprehend_size_unit_average:float=250,
431
  average_characters_per_page:float=2000,
432
+ TEXTRACT_TEXT_EXTRACT_OPTION:str=TEXTRACT_TEXT_EXTRACT_OPTION,
433
+ NO_REDACTION_PII_OPTION:str=NO_REDACTION_PII_OPTION,
434
+ AWS_PII_OPTION:str=AWS_PII_OPTION):
435
  '''
436
  Calculate the approximate cost of submitting a document to AWS Textract and/or AWS Comprehend, assuming that Textract outputs do not already exist in the output folder.
437
 
 
448
  - comprehend_unit_cost (float, optional): Cost per 'unit' (300 character minimum) for identifying PII in text with AWS Comprehend.
449
  - comprehend_size_unit_average (float, optional): Average size of a 'unit' of text passed to AWS Comprehend by the app through the batching process
450
  - average_characters_per_page (float, optional): Average number of characters on an A4 page.
451
+ - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
452
+ - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
453
+ - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
454
  '''
455
  text_extraction_cost = 0
456
  pii_identification_cost = 0
 
458
  number_of_pages = int(number_of_pages)
459
 
460
  if textract_output_found_checkbox != True:
461
+ if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
462
  text_extraction_cost = number_of_pages * textract_page_cost
463
 
464
  if "Extract signatures" in handwrite_signature_checkbox:
465
  text_extraction_cost += (textract_signature_cost * number_of_pages)
466
 
467
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
468
+ if pii_identification_method == AWS_PII_OPTION:
469
  comprehend_page_cost = ceil(average_characters_per_page / comprehend_size_unit_average) * comprehend_unit_cost
470
  pii_identification_cost = comprehend_page_cost * number_of_pages
471
 
 
488
  local_text_extraction_page_time:float=0.3,
489
  local_pii_redaction_page_time:float=0.5,
490
  local_ocr_extraction_page_time:float=1.5,
491
+ TEXTRACT_TEXT_EXTRACT_OPTION:str=TEXTRACT_TEXT_EXTRACT_OPTION,
492
+ SELECTABLE_TEXT_EXTRACT_OPTION:str=SELECTABLE_TEXT_EXTRACT_OPTION,
493
+ local_ocr_option:str=TESSERACT_TEXT_EXTRACT_OPTION,
494
+ NO_REDACTION_PII_OPTION:str=NO_REDACTION_PII_OPTION,
495
+ AWS_PII_OPTION:str=AWS_PII_OPTION):
496
  '''
497
  Calculate the approximate time to redact a document.
498
 
 
507
  - local_text_redaction_page_time (float, optional): Approximate time to extract text on a page with the local text redaction option.
508
  - local_pii_redaction_page_time (float, optional): Approximate time to redact text on a page with the local text redaction option.
509
  - local_ocr_extraction_page_time (float, optional): Approximate time to extract text from a page with the local OCR redaction option.
510
+ - TEXTRACT_TEXT_EXTRACT_OPTION (str, optional): String label for the text_extract_method_radio button for AWS Textract.
511
+ - SELECTABLE_TEXT_EXTRACT_OPTION (str, optional): String label for text_extract_method_radio for text extraction.
512
  - local_ocr_option (str, optional): String label for text_extract_method_radio for local OCR.
513
+ - NO_REDACTION_PII_OPTION (str, optional): String label for pii_identification_method_drop for no redaction.
514
+ - AWS_PII_OPTION (str, optional): String label for pii_identification_method_drop for AWS Comprehend.
515
  '''
516
  calculated_time_taken = 0
517
  page_conversion_time_taken = 0
 
521
  number_of_pages = int(number_of_pages)
522
 
523
  # Page preparation/conversion to image time
524
+ if (text_extract_method_radio != SELECTABLE_TEXT_EXTRACT_OPTION) and (textract_output_found_checkbox != True):
525
  page_conversion_time_taken = number_of_pages * convert_page_time
526
 
527
  # Page text extraction time
528
+ if text_extract_method_radio == TEXTRACT_TEXT_EXTRACT_OPTION:
529
  if textract_output_found_checkbox != True:
530
  page_extraction_time_taken = number_of_pages * textract_page_time
531
  elif text_extract_method_radio == local_ocr_option:
532
  if local_ocr_output_found_checkbox != True:
533
  page_extraction_time_taken = number_of_pages * local_ocr_extraction_page_time
534
+ elif text_extract_method_radio == SELECTABLE_TEXT_EXTRACT_OPTION:
535
  page_conversion_time_taken = number_of_pages * local_text_extraction_page_time
536
 
537
  # Page redaction time
538
+ if pii_identification_method != NO_REDACTION_PII_OPTION:
539
+ if pii_identification_method == AWS_PII_OPTION:
540
  page_redaction_time_taken = number_of_pages * comprehend_page_time
541
  else:
542
  page_redaction_time_taken = number_of_pages * local_pii_redaction_page_time
tools/redaction_review.py CHANGED
@@ -14,8 +14,8 @@ import pymupdf
14
  from PIL import ImageDraw, Image
15
  from datetime import datetime, timezone, timedelta
16
 
17
- from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
18
- from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21
 
 
14
  from PIL import ImageDraw, Image
15
  from datetime import datetime, timezone, timedelta
16
 
17
+ from tools.config import OUTPUT_FOLDER, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
18
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21