Sean Pedrick-Case commited on
Commit
d84e0a9
·
unverified ·
2 Parent(s): 8b4217f b397d1d

Merge pull request #6 from seanpedrick-case/dev

Browse files

Export to Adobe, fuzzy matching, and duplicate page identification

Dockerfile CHANGED
@@ -60,6 +60,9 @@ RUN mkdir -p /home/user/app/output \
60
  # Copy installed packages from builder stage
61
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
62
 
 
 
 
63
  # Entrypoint helps to switch between Gradio and Lambda mode
64
  COPY entrypoint.sh /entrypoint.sh
65
 
 
60
  # Copy installed packages from builder stage
61
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
62
 
63
+ # Download NLTK data packages
64
+ RUN python -m nltk.downloader punkt stopwords punkt_tab
65
+
66
  # Entrypoint helps to switch between Gradio and Lambda mode
67
  COPY entrypoint.sh /entrypoint.sh
68
 
app.py CHANGED
@@ -10,15 +10,16 @@ from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
- from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
21
  from tools.custom_csvlogger import CSVLogger_custom
 
22
 
23
  today_rev = datetime.now().strftime("%Y%m%d")
24
 
@@ -29,15 +30,16 @@ ensure_output_folder_exists()
29
 
30
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
31
 
32
- full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
33
 
34
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
35
  chosen_comprehend_entities.extend(custom_entities)
36
  full_comprehend_entity_list.extend(custom_entities)
37
 
 
38
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
39
 
40
- full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
41
 
42
  language = 'en'
43
 
@@ -67,10 +69,9 @@ with app:
67
  pdf_doc_state = gr.State([])
68
  all_image_annotations_state = gr.State([])
69
 
70
-
71
- all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
72
- all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
73
- review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
74
 
75
  session_hash_state = gr.State()
76
  s3_output_folder_state = gr.State()
@@ -129,16 +130,16 @@ with app:
129
  ## Settings page variables
130
  default_allow_list_file_name = "default_allow_list.csv"
131
  default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
132
- in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
133
 
134
  default_deny_list_file_name = "default_deny_list.csv"
135
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
136
- in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
137
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
138
 
139
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
140
  fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
141
- in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
142
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
143
 
144
  # S3 settings for default allow list load
@@ -149,6 +150,12 @@ with app:
149
  # Base dataframe for recognisers that is not modified subsequent to load
150
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
151
 
 
 
 
 
 
 
152
  ###
153
  # UI DESIGN
154
  ###
@@ -164,8 +171,10 @@ with app:
164
 
165
  NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
166
 
167
- # PDF / IMAGES TAB
168
- with gr.Tab("PDFs/images"):
 
 
169
  with gr.Accordion("Redact document", open = True):
170
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
171
  if RUN_AWS_FUNCTIONS == "1":
@@ -194,7 +203,9 @@ with app:
194
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
195
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
196
 
197
- # Object annotation
 
 
198
  with gr.Tab("Review redactions", id="tab_object_annotation"):
199
 
200
  with gr.Accordion(label = "Review redaction file", open=True):
@@ -215,7 +226,6 @@ with app:
215
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
216
 
217
  with gr.Row():
218
-
219
  with gr.Column(scale=1):
220
 
221
  zoom_str = str(annotator_zoom_number) + '%'
@@ -247,10 +257,16 @@ with app:
247
  #with gr.Column(scale=1):
248
  with gr.Row():
249
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
250
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
251
 
252
-
 
 
 
 
 
253
  # TEXT / TABULAR DATA TAB
 
254
  with gr.Tab(label="Open text or Excel/csv files"):
255
  gr.Markdown(
256
  """
@@ -280,7 +296,20 @@ with app:
280
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
281
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
282
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  # SETTINGS TAB
 
284
  with gr.Tab(label="Redaction settings"):
285
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
286
  with gr.Row():
@@ -296,9 +325,12 @@ with app:
296
 
297
  with gr.Accordion("Select entity types to redact", open = True):
298
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
299
-
300
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
301
 
 
 
 
 
302
  with gr.Accordion("Redact only selected pages", open = False):
303
  with gr.Row():
304
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
@@ -312,21 +344,30 @@ with app:
312
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
313
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
314
 
315
- log_files_output = gr.File(label="Log file output", interactive=False)
 
 
 
 
 
 
 
 
 
316
 
317
  ###
318
  # PDF/IMAGE REDACTION
319
  ###
320
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
321
 
322
- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
323
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
324
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
325
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
326
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
327
 
328
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
329
- current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
330
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
331
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
332
 
@@ -339,7 +380,8 @@ with app:
339
  ###
340
 
341
  # Upload previous files for modifying redactions
342
- upload_previous_review_file_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
 
343
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
344
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
345
 
@@ -397,7 +439,16 @@ with app:
397
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
398
  then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
399
 
400
-
 
 
 
 
 
 
 
 
 
401
  ###
402
  # TABULAR DATA REDACTION
403
  ###
@@ -410,13 +461,22 @@ with app:
410
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
411
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
412
 
 
 
 
 
 
413
  ###
414
  # SETTINGS PAGE INPUT / OUTPUT
415
  ###
416
- # If a custom allow list is uploaded
417
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
418
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
419
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
 
 
 
420
 
421
 
422
  ###
 
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
  from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
+ from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
21
  from tools.custom_csvlogger import CSVLogger_custom
22
+ from tools.find_duplicate_pages import identify_similar_pages
23
 
24
  today_rev = datetime.now().strftime("%Y%m%d")
25
 
 
30
 
31
  chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
32
 
33
+ full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
34
 
35
  # Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
36
  chosen_comprehend_entities.extend(custom_entities)
37
  full_comprehend_entity_list.extend(custom_entities)
38
 
39
+ # Entities for local PII redaction option
40
  chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
41
 
42
+ full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
43
 
44
  language = 'en'
45
 
 
69
  pdf_doc_state = gr.State([])
70
  all_image_annotations_state = gr.State([])
71
 
72
+ all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
+ all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
74
+ review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
 
75
 
76
  session_hash_state = gr.State()
77
  s3_output_folder_state = gr.State()
 
130
  ## Settings page variables
131
  default_allow_list_file_name = "default_allow_list.csv"
132
  default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
133
+ in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
134
 
135
  default_deny_list_file_name = "default_deny_list.csv"
136
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
137
+ in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
138
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
139
 
140
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
141
  fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
142
+ in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
143
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
144
 
145
  # S3 settings for default allow list load
 
150
  # Base dataframe for recognisers that is not modified subsequent to load
151
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
152
 
153
+ # Duplicate page detection
154
+ in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
155
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
156
+
157
+
158
+
159
  ###
160
  # UI DESIGN
161
  ###
 
171
 
172
  NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
173
 
174
+ ###
175
+ # REDACTION PDF/IMAGES TABL
176
+ ###
177
+ with gr.Tab("Redact PDFs/images"):
178
  with gr.Accordion("Redact document", open = True):
179
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
180
  if RUN_AWS_FUNCTIONS == "1":
 
203
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
204
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
205
 
206
+ ###
207
+ # REVIEW REDACTIONS TAB
208
+ ###
209
  with gr.Tab("Review redactions", id="tab_object_annotation"):
210
 
211
  with gr.Accordion(label = "Review redaction file", open=True):
 
226
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
227
 
228
  with gr.Row():
 
229
  with gr.Column(scale=1):
230
 
231
  zoom_str = str(annotator_zoom_number) + '%'
 
257
  #with gr.Column(scale=1):
258
  with gr.Row():
259
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
260
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
261
 
262
+ with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
263
+ convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
264
+ adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
265
+ convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
266
+
267
+ ###
268
  # TEXT / TABULAR DATA TAB
269
+ ###
270
  with gr.Tab(label="Open text or Excel/csv files"):
271
  gr.Markdown(
272
  """
 
296
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
297
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
298
 
299
+ ###
300
+ # IDENTIFY DUPLICATE PAGES TAB
301
+ ###
302
+ with gr.Tab(label="Identify duplicate pages"):
303
+ with gr.Accordion("Identify duplicate pages to redact", open = True):
304
+ in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
305
+
306
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
307
+
308
+ duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
309
+
310
+ ###
311
  # SETTINGS TAB
312
+ ###
313
  with gr.Tab(label="Redaction settings"):
314
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
315
  with gr.Row():
 
325
 
326
  with gr.Accordion("Select entity types to redact", open = True):
327
  in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
 
328
  in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
329
 
330
+ with gr.Row():
331
+ max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
332
+ match_fuzzy_whole_phrase_bool = gr.Checkbox(label="Should fuzzy match on entire phrases in deny list (as opposed to each word individually)?", value=True)
333
+
334
  with gr.Accordion("Redact only selected pages", open = False):
335
  with gr.Row():
336
  page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
 
344
  with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
345
  anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
346
 
347
+ log_files_output = gr.File(label="Log file output", interactive=False)
348
+
349
+ with gr.Accordion("Combine multiple review files", open = False):
350
+ multiple_review_files_in_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv'])
351
+ merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
352
+
353
+
354
+
355
+
356
+ ### UI INTERACTION ###
357
 
358
  ###
359
  # PDF/IMAGE REDACTION
360
  ###
361
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
362
 
363
+ document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
364
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
365
+ then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
366
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
367
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
368
 
369
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
370
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
371
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
372
  then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
373
 
 
380
  ###
381
 
382
  # Upload previous files for modifying redactions
383
+ upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
384
+ then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
385
  then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
386
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
387
 
 
439
  then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
440
  then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
441
 
442
+ # Convert review file to xfdf Adobe format
443
+ convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
444
+ then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
445
+ then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
446
+
447
+ # Convert xfdf Adobe file back to review_file.csv
448
+ convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
449
+ then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
450
+ then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
451
+
452
  ###
453
  # TABULAR DATA REDACTION
454
  ###
 
461
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
462
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
463
 
464
+ ###
465
+ # IDENTIFY DUPLICATE PAGES
466
+ ###
467
+ find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages], outputs=[duplicate_pages_df, duplicate_pages_out])
468
+
469
  ###
470
  # SETTINGS PAGE INPUT / OUTPUT
471
  ###
472
+ # If a custom allow/deny/duplicate page list is uploaded
473
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
474
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
475
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
476
+
477
+
478
+ # Merge multiple review csv files together
479
+ merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
480
 
481
 
482
  ###
requirements.txt CHANGED
@@ -7,6 +7,8 @@ presidio_anonymizer==2.2.355
7
  presidio-image-redactor==0.0.53
8
  pikepdf==8.15.1
9
  pandas==2.2.3
 
 
10
  spacy==3.8.3
11
 #en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
12
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
@@ -15,6 +17,8 @@ boto3==1.35.83
15
  pyarrow==18.1.0
16
  openpyxl==3.1.2
17
  Faker==22.2.0
 
 
18
  gradio_image_annotation==0.2.5
19
  numpy==1.26.4
20
  awslambdaric==3.0.0
 
7
  presidio-image-redactor==0.0.53
8
  pikepdf==8.15.1
9
  pandas==2.2.3
10
+ nltk==3.9.1
11
+ scikit-learn==1.5.2
12
  spacy==3.8.3
13
 #en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
 
17
  pyarrow==18.1.0
18
  openpyxl==3.1.2
19
  Faker==22.2.0
20
+ python-levenshtein==0.26.1
21
+ spaczz==0.6.1
22
  gradio_image_annotation==0.2.5
23
  numpy==1.26.4
24
  awslambdaric==3.0.0
tools/custom_image_analyser_engine.py CHANGED
@@ -560,7 +560,7 @@ def run_page_text_redaction(
560
  if not nlp_analyser:
561
  raise ValueError("nlp_analyser is required for Local identification method")
562
 
563
- print("page text:", page_text)
564
 
565
  page_analyser_result = nlp_analyser.analyze(
566
  text=page_text,
@@ -1077,15 +1077,15 @@ class CustomImageAnalyzerEngine:
1077
  line_length = len(line_text)
1078
  redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
- # print(f"Processing line: '{line_text}'")
1081
 
1082
  for redaction_result in text_analyzer_results:
1083
- # print(f"Checking redaction result: {redaction_result}")
1084
- # print("redaction_text:", redaction_text)
1085
- # print("line_length:", line_length)
1086
- # print("line_text:", line_text)
1087
 
1088
- # Check if the redaction text is no in the allow list
1089
 
1090
  if redaction_text not in allow_list:
1091
 
@@ -1098,14 +1098,45 @@ class CustomImageAnalyzerEngine:
1098
  matched_words = matched_text.split()
1099
 
1100
  # print(f"Found match: '{matched_text}' in line")
 
 
 
 
 
 
1101
 
1102
  # Find the corresponding words in the OCR results
1103
  matching_word_boxes = []
 
 
 
 
 
1104
  for word_info in ocr_results_with_children_child_info.get('words', []):
1105
- # Check if this word is part of our match
1106
- if any(word.lower() in word_info['text'].lower() for word in matched_words):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1107
  matching_word_boxes.append(word_info['bounding_box'])
1108
- # print(f"Matched word: {word_info['text']}")
1109
 
1110
  if matching_word_boxes:
1111
  # Calculate the combined bounding box for all matching words
@@ -1127,7 +1158,7 @@ class CustomImageAnalyzerEngine:
1127
  text=matched_text
1128
  )
1129
  )
1130
- # print(f"Added bounding box for: '{matched_text}'")
1131
 
1132
  return redaction_bboxes
1133
 
 
560
  if not nlp_analyser:
561
  raise ValueError("nlp_analyser is required for Local identification method")
562
 
563
+ #print("page text:", page_text)
564
 
565
  page_analyser_result = nlp_analyser.analyze(
566
  text=page_text,
 
1077
  line_length = len(line_text)
1078
  redaction_text = redaction_relevant_ocr_result.text
1079
 
1080
+ #print(f"Processing line: '{line_text}'")
1081
 
1082
  for redaction_result in text_analyzer_results:
1083
+ #print(f"Checking redaction result: {redaction_result}")
1084
+ #print("redaction_text:", redaction_text)
1085
+ #print("line_length:", line_length)
1086
+ #print("line_text:", line_text)
1087
 
1088
+ # Check if the redaction text is not in the allow list
1089
 
1090
  if redaction_text not in allow_list:
1091
 
 
1098
  matched_words = matched_text.split()
1099
 
1100
  # print(f"Found match: '{matched_text}' in line")
1101
+
1102
+ # for word_info in ocr_results_with_children_child_info.get('words', []):
1103
+ # # Check if this word is part of our match
1104
+ # if any(word.lower() in word_info['text'].lower() for word in matched_words):
1105
+ # matching_word_boxes.append(word_info['bounding_box'])
1106
+ # print(f"Matched word: {word_info['text']}")
1107
 
1108
  # Find the corresponding words in the OCR results
1109
  matching_word_boxes = []
1110
+
1111
+ #print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
1112
+
1113
+ current_position = 0
1114
+
1115
  for word_info in ocr_results_with_children_child_info.get('words', []):
1116
+ word_text = word_info['text']
1117
+ word_length = len(word_text)
1118
+
1119
+ # Assign start and end character positions
1120
+ #word_info['start_position'] = current_position
1121
+ #word_info['end_position'] = current_position + word_length
1122
+
1123
+ word_start = current_position
1124
+ word_end = current_position + word_length
1125
+
1126
+ # Update current position for the next word
1127
+ current_position += word_length + 1 # +1 for the space after the word
1128
+
1129
+ #print("word_info['bounding_box']:", word_info['bounding_box'])
1130
+ #print("word_start:", word_start)
1131
+ #print("start_in_line:", start_in_line)
1132
+
1133
+ #print("word_end:", word_end)
1134
+ #print("end_in_line:", end_in_line)
1135
+
1136
+ # Check if the word's bounding box is within the start and end bounds
1137
+ if word_start >= start_in_line and word_end <= (end_in_line + 1):
1138
  matching_word_boxes.append(word_info['bounding_box'])
1139
+ #print(f"Matched word: {word_info['text']}")
1140
 
1141
  if matching_word_boxes:
1142
  # Calculate the combined bounding box for all matching words
 
1158
  text=matched_text
1159
  )
1160
  )
1161
+ #print(f"Added bounding box for: '{matched_text}'")
1162
 
1163
  return redaction_bboxes
1164
 
tools/data_anonymise.py CHANGED
@@ -12,7 +12,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
12
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
- from tools.helper_functions import output_folder, get_file_path_end, read_file, detect_file_type
16
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
@@ -434,7 +434,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
434
  file_type = detect_file_type(anon_file)
435
  print("File type is:", file_type)
436
 
437
- out_file_part = get_file_path_end(anon_file.name)
438
 
439
  if file_type == 'xlsx':
440
  print("Running through all xlsx sheets")
@@ -472,7 +472,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
472
  else:
473
  sheet_name = ""
474
  anon_df = read_file(anon_file)
475
- out_file_part = get_file_path_end(anon_file.name)
476
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
477
 
478
  # Increase latest file completed count unless we are at the last file
 
12
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
13
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
14
 
15
+ from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
16
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
17
 
18
  # Use custom version of analyze_dict to be able to track progress
 
434
  file_type = detect_file_type(anon_file)
435
  print("File type is:", file_type)
436
 
437
+ out_file_part = get_file_name_without_type(anon_file.name)
438
 
439
  if file_type == 'xlsx':
440
  print("Running through all xlsx sheets")
 
472
  else:
473
  sheet_name = ""
474
  anon_df = read_file(anon_file)
475
+ out_file_part = get_file_name_without_type(anon_file.name)
476
  out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
477
 
478
  # Increase latest file completed count unless we are at the last file
tools/file_conversion.py CHANGED
@@ -1,5 +1,5 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_path_end, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
@@ -7,6 +7,7 @@ import time
7
  import json
8
  import pymupdf
9
  import pandas as pd
 
10
  from pymupdf import Rect
11
  from fitz import Page
12
  from tqdm import tqdm
@@ -240,7 +241,7 @@ def get_input_file_names(file_input:List[str]):
240
  else:
241
  file_path = file.name
242
 
243
- file_path_without_ext = get_file_path_end(file_path)
244
 
245
  file_extension = os.path.splitext(file_path)[1].lower()
246
 
@@ -489,7 +490,7 @@ def prepare_image_or_pdf(
489
  file_path = file
490
  else:
491
  file_path = file.name
492
- file_path_without_ext = get_file_path_end(file_path)
493
  file_name_with_ext = os.path.basename(file_path)
494
 
495
  if not file_path:
@@ -668,7 +669,7 @@ def prepare_image_or_pdf(
668
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
669
 
670
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
671
- file_path_without_ext = get_file_path_end(in_file_path)
672
 
673
  out_file_paths = out_text_file_path
674
 
@@ -754,7 +755,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
754
  if 'text' not in box:
755
  data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
756
  else:
757
- data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
758
  #print("data_to_add:", data_to_add)
759
  flattened_annotation_data.append(data_to_add)
760
 
@@ -764,7 +765,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
764
  #print("redaction_decision_output:", redaction_decision_output)
765
  #print("annotation_data_as_df:", annotation_data_as_df)
766
 
767
- # Join on additional text data from decision output results if included
768
  if not redaction_decision_output.empty:
769
  #print("redaction_decision_output is not empty")
770
  #print("redaction_decision_output:", redaction_decision_output)
@@ -793,6 +794,9 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
793
  if col not in annotation_data_as_df.columns:
794
  annotation_data_as_df[col] = ''
795
 
 
 
 
796
  annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
797
 
798
  return annotation_data_as_df
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+ from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
 
7
  import json
8
  import pymupdf
9
  import pandas as pd
10
+ import numpy as np
11
  from pymupdf import Rect
12
  from fitz import Page
13
  from tqdm import tqdm
 
241
  else:
242
  file_path = file.name
243
 
244
+ file_path_without_ext = get_file_name_without_type(file_path)
245
 
246
  file_extension = os.path.splitext(file_path)[1].lower()
247
 
 
490
  file_path = file
491
  else:
492
  file_path = file.name
493
+ file_path_without_ext = get_file_name_without_type(file_path)
494
  file_name_with_ext = os.path.basename(file_path)
495
 
496
  if not file_path:
 
669
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
670
 
671
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
672
+ file_path_without_ext = get_file_name_without_type(in_file_path)
673
 
674
  out_file_paths = out_text_file_path
675
 
 
755
  if 'text' not in box:
756
  data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
757
  else:
758
+ data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
759
  #print("data_to_add:", data_to_add)
760
  flattened_annotation_data.append(data_to_add)
761
 
 
765
  #print("redaction_decision_output:", redaction_decision_output)
766
  #print("annotation_data_as_df:", annotation_data_as_df)
767
 
768
+ # Join on additional text data from decision output results if included, if text not already there
769
  if not redaction_decision_output.empty:
770
  #print("redaction_decision_output is not empty")
771
  #print("redaction_decision_output:", redaction_decision_output)
 
794
  if col not in annotation_data_as_df.columns:
795
  annotation_data_as_df[col] = ''
796
 
797
+ for col in ['xmin', 'xmax', 'ymin', 'ymax']:
798
+ annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
799
+
800
  annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
801
 
802
  return annotation_data_as_df
tools/file_redaction.py CHANGED
@@ -27,8 +27,8 @@ from presidio_analyzer import RecognizerResult
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
- from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
31
- from tools.helper_functions import get_file_path_end, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
@@ -94,6 +94,8 @@ def choose_and_run_redactor(file_paths:List[str],
94
  page_break_return:bool=False,
95
  pii_identification_method:str="Local",
96
  comprehend_query_number:int=0,
 
 
97
  output_folder:str=output_folder,
98
  progress=gr.Progress(track_tqdm=True)):
99
  '''
@@ -127,6 +129,8 @@ def choose_and_run_redactor(file_paths:List[str],
127
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
128
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
129
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
 
 
130
  - output_folder (str, optional): Output folder for results.
131
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
132
 
@@ -136,7 +140,7 @@ def choose_and_run_redactor(file_paths:List[str],
136
  tic = time.perf_counter()
137
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
138
 
139
- print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
140
  review_out_file_paths = [prepared_pdf_file_paths[0]]
141
 
142
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
@@ -279,9 +283,9 @@ def choose_and_run_redactor(file_paths:List[str],
279
  file_path = file.name
280
 
281
  if file_path:
282
- pdf_file_name_without_ext = get_file_path_end(file_path)
283
  pdf_file_name_with_ext = os.path.basename(file_path)
284
- print("Redacting file:", pdf_file_name_with_ext)
285
 
286
  is_a_pdf = is_pdf(file_path) == True
287
  if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -327,7 +331,9 @@ def choose_and_run_redactor(file_paths:List[str],
327
  comprehend_client,
328
  textract_client,
329
  custom_recogniser_word_list,
330
- redact_whole_page_list)
 
 
331
 
332
 
333
  #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
@@ -366,7 +372,9 @@ def choose_and_run_redactor(file_paths:List[str],
366
  comprehend_query_number,
367
  comprehend_client,
368
  custom_recogniser_word_list,
369
- redact_whole_page_list)
 
 
370
 
371
  else:
372
  out_message = "No redaction method selected"
@@ -414,13 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],
414
 
415
  # Save the gradio_annotation_boxes to a JSON file
416
  try:
417
- #print("Saving annotations to JSON")
418
-
419
- out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
420
- with open(out_annotation_file_path, 'w') as f:
421
- json.dump(annotations_all_pages, f)
422
- log_files_output_paths.append(out_annotation_file_path)
423
-
424
  #print("Saving annotations to CSV")
425
 
426
  # Convert json to csv and also save this
@@ -435,6 +437,13 @@ def choose_and_run_redactor(file_paths:List[str],
435
 
436
  print("Saved review file to csv")
437
 
 
 
 
 
 
 
 
438
  except Exception as e:
439
  print("Could not save annotations to json or csv file:", e)
440
 
@@ -694,10 +703,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
694
  x1 = pymupdf_x1
695
  x2 = pymupdf_x2
696
 
697
- # if hasattr(annot, 'text') and annot.text:
698
- # img_annotation_box["text"] = annot.text
699
- # else:
700
- # img_annotation_box["text"] = ""
701
 
702
  # Else should be CustomImageRecognizerResult
703
  else:
@@ -715,10 +724,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
715
  img_annotation_box["label"] = annot.entity_type
716
  except:
717
  img_annotation_box["label"] = "Redaction"
718
- # if hasattr(annot, 'text') and annot.text:
719
- # img_annotation_box["text"] = annot.text
720
- # else:
721
- # img_annotation_box["text"] = ""
 
722
 
723
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
724
 
@@ -749,12 +759,14 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
749
 
750
  if isinstance(annot, Dictionary):
751
  img_annotation_box["label"] = str(annot["/T"])
 
 
 
 
 
752
  else:
753
  img_annotation_box["label"] = "REDACTION"
754
- # if hasattr(annot, 'text') and annot.text:
755
- # img_annotation_box["text"] = annot.text
756
- # else:
757
- # img_annotation_box["text"] = ""
758
 
759
  # Convert to a PyMuPDF Rect object
760
  #rect = Rect(rect_coordinates)
@@ -779,6 +791,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
779
 
780
  return page, out_annotation_boxes
781
 
 
 
 
 
 
782
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
783
 
784
  all_bboxes = []
@@ -908,6 +925,8 @@ def redact_image_pdf(file_path:str,
908
  textract_client:str="",
909
  custom_recogniser_word_list:List[str]=[],
910
  redact_whole_page_list:List[str]=[],
 
 
911
  page_break_val:int=int(page_break_value),
912
  log_files_output_paths:List=[],
913
  max_time:int=int(max_time_value),
@@ -940,14 +959,16 @@ def redact_image_pdf(file_path:str,
940
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
941
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
942
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
 
 
943
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
944
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
945
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
946
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
947
 
948
- The function returns a fully or partially-redacted PDF document.
949
  '''
950
- file_name = get_file_path_end(file_path)
951
  fill = (0, 0, 0) # Fill colour for redactions
952
  comprehend_query_number_new = 0
953
 
@@ -957,11 +978,14 @@ def redact_image_pdf(file_path:str,
957
  nlp_analyser.registry.remove_recognizer("CUSTOM")
958
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
959
  #print("new_custom_recogniser:", new_custom_recogniser)
960
- nlp_analyser.registry.add_recognizer(new_custom_recogniser)
961
 
 
 
 
 
962
 
963
- image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
964
-
965
 
966
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
967
  print("Connection to AWS Comprehend service unsuccessful.")
@@ -1051,7 +1075,7 @@ def redact_image_pdf(file_path:str,
1051
 
1052
  #print("Image is in range of pages to redact")
1053
  if isinstance(image, str):
1054
- print("image is a file path", image)
1055
  image = Image.open(image)
1056
 
1057
  # Need image size to convert textract OCR outputs to the correct sizes
@@ -1119,7 +1143,7 @@ def redact_image_pdf(file_path:str,
1119
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1120
 
1121
  # Step 2: Analyze text and identify PII
1122
- if chosen_redact_entities:
1123
 
1124
  redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
1125
  line_level_ocr_results,
@@ -1185,6 +1209,7 @@ def redact_image_pdf(file_path:str,
1185
 
1186
  ## Apply annotations with pymupdf
1187
  else:
 
1188
  #print("redact_whole_page_list:", redact_whole_page_list)
1189
  if redact_whole_page_list:
1190
  int_reported_page_number = int(reported_page_number)
@@ -1309,7 +1334,7 @@ def redact_image_pdf(file_path:str,
1309
 
1310
 
1311
  ###
1312
- # PIKEPDF TEXT PDF REDACTION
1313
  ###
1314
 
1315
  def get_text_container_characters(text_container:LTTextContainer):
@@ -1466,6 +1491,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
1466
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1467
  pikepdf_annotations_on_page = []
1468
  for analysed_bounding_box in analysed_bounding_boxes:
 
 
1469
  bounding_box = analysed_bounding_box["boundingBox"]
1470
  annotation = Dictionary(
1471
  Type=Name.Annot,
@@ -1477,6 +1504,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1477
  IC=[0, 0, 0],
1478
  CA=1, # Transparency
1479
  T=analysed_bounding_box["result"].entity_type,
 
1480
  BS=Dictionary(
1481
  W=0, # Border width: 1 point
1482
  S=Name.S # Border style: solid
@@ -1485,182 +1513,6 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1485
  pikepdf_annotations_on_page.append(annotation)
1486
  return pikepdf_annotations_on_page
1487
 
1488
- # def run_page_text_redaction(language: str, # Language of the PDF content
1489
- # chosen_redact_entities: List[str], # List of entities to be redacted
1490
- # chosen_redact_comprehend_entities: List[str],
1491
- # line_level_text_results_list: List[str],
1492
- # line_characters: List,
1493
- # page_analyser_results: List = [],
1494
- # page_analysed_bounding_boxes: List = [],
1495
- # comprehend_client = None, # Connection to AWS Comprehend
1496
- # allow_list: List[str] = None, # Optional list of allowed entities
1497
- # pii_identification_method: str = "Local"
1498
- # ):
1499
-
1500
- # # Initialize batching variables
1501
- # current_batch = ""
1502
- # current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1503
- # all_text_line_results = [] # Store results for all lines
1504
- # text_container_analyser_results = []
1505
- # text_container_analysed_bounding_boxes = []
1506
-
1507
- # # First pass: collect all lines into batches
1508
- # for i, text_line in enumerate(line_level_text_results_list):
1509
- # if chosen_redact_entities:
1510
- # if pii_identification_method == "Local":
1511
-
1512
- # #print("chosen_redact_entities:", chosen_redact_entities)
1513
-
1514
- # # Process immediately for local analysis
1515
- # text_line_analyser_result = nlp_analyser.analyze(
1516
- # text=text_line.text,
1517
- # language=language,
1518
- # entities=chosen_redact_entities,
1519
- # score_threshold=score_threshold,
1520
- # return_decision_process=True,
1521
- # allow_list=allow_list
1522
- # )
1523
- # all_text_line_results.append((i, text_line_analyser_result))
1524
-
1525
-
1526
- # elif pii_identification_method == "AWS Comprehend":
1527
-
1528
- # # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1529
- # custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1530
-
1531
-
1532
- # text_line_analyser_result = nlp_analyser.analyze(
1533
- # text=text_line.text,
1534
- # language=language,
1535
- # entities=custom_redact_entities,
1536
- # score_threshold=score_threshold,
1537
- # return_decision_process=True,
1538
- # allow_list=allow_list
1539
- # )
1540
- # all_text_line_results.append((i, text_line_analyser_result))
1541
-
1542
-
1543
- # if len(text_line.text) >= 3:
1544
- # # Add separator between lines
1545
- # if current_batch:
1546
- # current_batch += " | "
1547
-
1548
- # start_pos = len(current_batch)
1549
- # current_batch += text_line.text
1550
- # current_batch_mapping.append((start_pos, i, text_line))
1551
-
1552
- # # Process batch if approaching 300 characters or last line
1553
- # if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1554
- # print("length of text for Comprehend:", len(current_batch))
1555
-
1556
- # try:
1557
- # response = comprehend_client.detect_pii_entities(
1558
- # Text=current_batch,
1559
- # LanguageCode=language
1560
- # )
1561
- # except Exception as e:
1562
- # print(e)
1563
- # time.sleep(3)
1564
- # response = comprehend_client.detect_pii_entities(
1565
- # Text=current_batch,
1566
- # LanguageCode=language
1567
- # )
1568
-
1569
- # comprehend_query_number += 1
1570
-
1571
- # # Process response and map back to original lines
1572
- # if response and "Entities" in response:
1573
- # for entity in response["Entities"]:
1574
- # entity_start = entity["BeginOffset"]
1575
- # entity_end = entity["EndOffset"]
1576
-
1577
- # # Find which line this entity belongs to
1578
- # for batch_start, line_idx, original_line in current_batch_mapping:
1579
- # batch_end = batch_start + len(original_line.text)
1580
-
1581
- # # Check if entity belongs to this line
1582
- # if batch_start <= entity_start < batch_end:
1583
- # # Adjust offsets relative to original line
1584
- # relative_start = entity_start - batch_start
1585
- # relative_end = min(entity_end - batch_start, len(original_line.text))
1586
-
1587
- # result_text = original_line.text[relative_start:relative_end]
1588
-
1589
- # if result_text not in allow_list:
1590
- # if entity.get("Type") in chosen_redact_comprehend_entities:
1591
- # # Create adjusted entity
1592
- # adjusted_entity = entity.copy()
1593
- # adjusted_entity["BeginOffset"] = relative_start
1594
- # adjusted_entity["EndOffset"] = relative_end
1595
-
1596
- # recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1597
-
1598
- # # Add to results for this line
1599
- # existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1600
- # if not existing_results:
1601
- # all_text_line_results.append((line_idx, [recogniser_entity]))
1602
- # else:
1603
- # existing_results.append(recogniser_entity)
1604
-
1605
- # # Reset batch
1606
- # current_batch = ""
1607
- # current_batch_mapping = []
1608
-
1609
- # # Second pass: process results for each line
1610
- # for i, text_line in enumerate(line_level_text_results_list):
1611
- # text_line_analyser_result = []
1612
- # text_line_bounding_boxes = []
1613
-
1614
- # # Get results for this line
1615
- # line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1616
-
1617
- # if line_results:
1618
- # text_line_analyser_result = line_results
1619
-
1620
- # #print("Analysed text container, now merging bounding boxes")
1621
-
1622
- # # Merge bounding boxes if very close together
1623
- # text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1624
-
1625
- # #print("merged bounding boxes")
1626
-
1627
- # text_container_analyser_results.extend(text_line_analyser_result)
1628
- # #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1629
-
1630
- # #print("text_container_analyser_results:", text_container_analyser_results)
1631
-
1632
- # page_analyser_results.extend(text_container_analyser_results) # Add this line
1633
- # page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
1634
-
1635
- # return page_analysed_bounding_boxes
1636
-
1637
- # def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
1638
- # for entity in page_analyser_result:
1639
- # entity_start = entity.start
1640
- # entity_end = entity.end
1641
-
1642
- # for batch_start, line_idx, original_line, chars in page_text_mapping:
1643
- # batch_end = batch_start + len(original_line.text)
1644
-
1645
- # if batch_start <= entity_start < batch_end:
1646
- # relative_start = entity_start - batch_start
1647
- # relative_end = min(entity_end - batch_start, len(original_line.text))
1648
-
1649
- # adjusted_entity = copy.deepcopy(entity)
1650
- # adjusted_entity.start = relative_start
1651
- # adjusted_entity.end = relative_end
1652
-
1653
- # existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
1654
-
1655
- # if existing_entry is None:
1656
- # all_text_line_results.append((line_idx, [adjusted_entity]))
1657
- # else:
1658
- # existing_entry.append(adjusted_entity)
1659
- # break
1660
-
1661
- # return all_text_line_results
1662
-
1663
-
1664
  def redact_text_pdf(
1665
  filename: str, # Path to the PDF file to be redacted
1666
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
@@ -1682,6 +1534,8 @@ def redact_text_pdf(
1682
  comprehend_client="",
1683
  custom_recogniser_word_list:List[str]=[],
1684
  redact_whole_page_list:List[str]=[],
 
 
1685
  page_break_val: int = int(page_break_value), # Value for page break
1686
  max_time: int = int(max_time_value),
1687
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1711,6 +1565,8 @@ def redact_text_pdf(
1711
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1712
  - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
1713
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
 
 
1714
  - page_break_val: Value for page break
1715
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1716
  - progress: Progress tracking object
@@ -1726,9 +1582,12 @@ def redact_text_pdf(
1726
  if custom_recogniser_word_list:
1727
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1728
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
1729
- #print("new_custom_recogniser:", new_custom_recogniser)
1730
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1731
 
 
 
 
 
1732
  # List all elements currently in the nlp_analyser registry
1733
  #print("Current recognizers in nlp_analyser registry:")
1734
  #for recognizer_name in nlp_analyser.registry.recognizers:
@@ -1761,15 +1620,14 @@ def redact_text_pdf(
1761
  for page_no in progress_bar:
1762
 
1763
  reported_page_number = str(page_no + 1)
1764
- print("Redacting page:", reported_page_number)
1765
 
1766
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
1767
  try:
1768
  image = prepared_pdf_image_path[page_no]#.copy()
1769
  #print("image:", image)
1770
  except Exception as e:
1771
- print("Could not redact page:", reported_page_number, "due to:")
1772
- print(e)
1773
  continue
1774
 
1775
  image_annotations = {"image": image, "boxes": []}
@@ -1825,27 +1683,32 @@ def redact_text_pdf(
1825
 
1826
  ### REDACTION
1827
 
1828
- page_analysed_bounding_boxes = run_page_text_redaction(
1829
- language,
1830
- chosen_redact_entities,
1831
- chosen_redact_comprehend_entities,
1832
- all_line_level_text_results_list, #line_level_text_results_list,
1833
- all_line_characters,
1834
- page_analyser_results,
1835
- page_analysed_bounding_boxes,
1836
- comprehend_client,
1837
- allow_list,
1838
- pii_identification_method,
1839
- nlp_analyser,
1840
- score_threshold,
1841
- custom_entities,
1842
- comprehend_query_number
1843
- )
1844
-
1845
-
1846
- #print("page_analyser_results:", page_analyser_results)
1847
- #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1848
- #print("image:", image)
 
 
 
 
 
1849
 
1850
  page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
1851
 
@@ -1854,7 +1717,7 @@ def redact_text_pdf(
1854
  # Annotate redactions on page
1855
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1856
 
1857
- #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
1858
 
1859
  # Make pymupdf page redactions
1860
  #print("redact_whole_page_list:", redact_whole_page_list)
 
27
  from tools.aws_functions import RUN_AWS_FUNCTIONS
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
  from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
+ from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
31
+ from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
34
  from tools.presidio_analyzer_custom import recognizer_result_from_dict
 
94
  page_break_return:bool=False,
95
  pii_identification_method:str="Local",
96
  comprehend_query_number:int=0,
97
+ max_fuzzy_spelling_mistakes_num:int=1,
98
+ match_fuzzy_whole_phrase_bool:bool=True,
99
  output_folder:str=output_folder,
100
  progress=gr.Progress(track_tqdm=True)):
101
  '''
 
129
  - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
130
  - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
131
  - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
132
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
133
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
134
  - output_folder (str, optional): Output folder for results.
135
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
136
 
 
140
  tic = time.perf_counter()
141
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
142
 
143
+ #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
144
  review_out_file_paths = [prepared_pdf_file_paths[0]]
145
 
146
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
 
283
  file_path = file.name
284
 
285
  if file_path:
286
+ pdf_file_name_without_ext = get_file_name_without_type(file_path)
287
  pdf_file_name_with_ext = os.path.basename(file_path)
288
+ # print("Redacting file:", pdf_file_name_with_ext)
289
 
290
  is_a_pdf = is_pdf(file_path) == True
291
  if is_a_pdf == False and in_redact_method == text_ocr_option:
 
331
  comprehend_client,
332
  textract_client,
333
  custom_recogniser_word_list,
334
+ redact_whole_page_list,
335
+ max_fuzzy_spelling_mistakes_num,
336
+ match_fuzzy_whole_phrase_bool)
337
 
338
 
339
  #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
 
372
  comprehend_query_number,
373
  comprehend_client,
374
  custom_recogniser_word_list,
375
+ redact_whole_page_list,
376
+ max_fuzzy_spelling_mistakes_num,
377
+ match_fuzzy_whole_phrase_bool)
378
 
379
  else:
380
  out_message = "No redaction method selected"
 
422
 
423
  # Save the gradio_annotation_boxes to a JSON file
424
  try:
425
+
 
 
 
 
 
 
426
  #print("Saving annotations to CSV")
427
 
428
  # Convert json to csv and also save this
 
437
 
438
  print("Saved review file to csv")
439
 
440
+ out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
441
+ with open(out_annotation_file_path, 'w') as f:
442
+ json.dump(annotations_all_pages, f)
443
+ log_files_output_paths.append(out_annotation_file_path)
444
+
445
+ print("Saving annotations to JSON")
446
+
447
  except Exception as e:
448
  print("Could not save annotations to json or csv file:", e)
449
 
 
703
  x1 = pymupdf_x1
704
  x2 = pymupdf_x2
705
 
706
+ if hasattr(annot, 'text') and annot.text:
707
+ img_annotation_box["text"] = annot.text
708
+ else:
709
+ img_annotation_box["text"] = ""
710
 
711
  # Else should be CustomImageRecognizerResult
712
  else:
 
724
  img_annotation_box["label"] = annot.entity_type
725
  except:
726
  img_annotation_box["label"] = "Redaction"
727
+
728
+ if hasattr(annot, 'text') and annot.text:
729
+ img_annotation_box["text"] = annot.text
730
+ else:
731
+ img_annotation_box["text"] = ""
732
 
733
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2) # Create the PyMuPDF Rect
734
 
 
759
 
760
  if isinstance(annot, Dictionary):
761
  img_annotation_box["label"] = str(annot["/T"])
762
+
763
+ if hasattr(annot, 'Contents'):
764
+ img_annotation_box["text"] = annot.Contents
765
+ else:
766
+ img_annotation_box["text"] = ""
767
  else:
768
  img_annotation_box["label"] = "REDACTION"
769
+ img_annotation_box["text"] = ""
 
 
 
770
 
771
  # Convert to a PyMuPDF Rect object
772
  #rect = Rect(rect_coordinates)
 
791
 
792
  return page, out_annotation_boxes
793
 
794
+ ###
795
+ # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
796
+ ###
797
+
798
+
799
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
800
 
801
  all_bboxes = []
 
925
  textract_client:str="",
926
  custom_recogniser_word_list:List[str]=[],
927
  redact_whole_page_list:List[str]=[],
928
+ max_fuzzy_spelling_mistakes_num:int=1,
929
+ match_fuzzy_whole_phrase_bool:bool=True,
930
  page_break_val:int=int(page_break_value),
931
  log_files_output_paths:List=[],
932
  max_time:int=int(max_time_value),
 
959
  - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
960
  - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
961
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
962
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
963
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
964
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
965
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
966
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
967
  - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
968
 
969
+ The function returns a redacted PDF document along with processing output objects.
970
  '''
971
+ file_name = get_file_name_without_type(file_path)
972
  fill = (0, 0, 0) # Fill colour for redactions
973
  comprehend_query_number_new = 0
974
 
 
978
  nlp_analyser.registry.remove_recognizer("CUSTOM")
979
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
980
  #print("new_custom_recogniser:", new_custom_recogniser)
981
+ nlp_analyser.registry.add_recognizer(new_custom_recogniser)
982
 
983
+ nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
984
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
985
+ #print("new_custom_recogniser:", new_custom_recogniser)
986
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
987
 
988
+ image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
 
989
 
990
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
991
  print("Connection to AWS Comprehend service unsuccessful.")
 
1075
 
1076
  #print("Image is in range of pages to redact")
1077
  if isinstance(image, str):
1078
+ #print("image is a file path", image)
1079
  image = Image.open(image)
1080
 
1081
  # Need image size to convert textract OCR outputs to the correct sizes
 
1143
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1144
 
1145
  # Step 2: Analyze text and identify PII
1146
+ if chosen_redact_entities or chosen_redact_comprehend_entities:
1147
 
1148
  redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
1149
  line_level_ocr_results,
 
1209
 
1210
  ## Apply annotations with pymupdf
1211
  else:
1212
+ print("merged_redaction_boxes:", merged_redaction_bboxes)
1213
  #print("redact_whole_page_list:", redact_whole_page_list)
1214
  if redact_whole_page_list:
1215
  int_reported_page_number = int(reported_page_number)
 
1334
 
1335
 
1336
  ###
1337
+ # PIKEPDF TEXT DETECTION/REDACTION
1338
  ###
1339
 
1340
  def get_text_container_characters(text_container:LTTextContainer):
 
1491
  def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1492
  pikepdf_annotations_on_page = []
1493
  for analysed_bounding_box in analysed_bounding_boxes:
1494
+ #print("analysed_bounding_box:", analysed_bounding_boxes)
1495
+
1496
  bounding_box = analysed_bounding_box["boundingBox"]
1497
  annotation = Dictionary(
1498
  Type=Name.Annot,
 
1504
  IC=[0, 0, 0],
1505
  CA=1, # Transparency
1506
  T=analysed_bounding_box["result"].entity_type,
1507
+ Contents=analysed_bounding_box["text"],
1508
  BS=Dictionary(
1509
  W=0, # Border width: 1 point
1510
  S=Name.S # Border style: solid
 
1513
  pikepdf_annotations_on_page.append(annotation)
1514
  return pikepdf_annotations_on_page
1515
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1516
  def redact_text_pdf(
1517
  filename: str, # Path to the PDF file to be redacted
1518
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
 
1534
  comprehend_client="",
1535
  custom_recogniser_word_list:List[str]=[],
1536
  redact_whole_page_list:List[str]=[],
1537
+ max_fuzzy_spelling_mistakes_num:int=1,
1538
+ match_fuzzy_whole_phrase_bool:bool=True,
1539
  page_break_val: int = int(page_break_value), # Value for page break
1540
  max_time: int = int(max_time_value),
1541
  progress: Progress = Progress(track_tqdm=True) # Progress tracking object
 
1565
  - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
1566
  - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
1567
  - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
1568
+ - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
1569
+ - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
1570
  - page_break_val: Value for page break
1571
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
1572
  - progress: Progress tracking object
 
1582
  if custom_recogniser_word_list:
1583
  nlp_analyser.registry.remove_recognizer("CUSTOM")
1584
  new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
 
1585
  nlp_analyser.registry.add_recognizer(new_custom_recogniser)
1586
 
1587
+ nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
1588
+ new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
1589
+ nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
1590
+
1591
  # List all elements currently in the nlp_analyser registry
1592
  #print("Current recognizers in nlp_analyser registry:")
1593
  #for recognizer_name in nlp_analyser.registry.recognizers:
 
1620
  for page_no in progress_bar:
1621
 
1622
  reported_page_number = str(page_no + 1)
1623
+ #print("Redacting page:", reported_page_number)
1624
 
1625
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
1626
  try:
1627
  image = prepared_pdf_image_path[page_no]#.copy()
1628
  #print("image:", image)
1629
  except Exception as e:
1630
+ print("Could not redact page:", reported_page_number, "due to:", e)
 
1631
  continue
1632
 
1633
  image_annotations = {"image": image, "boxes": []}
 
1683
 
1684
  ### REDACTION
1685
 
1686
+ if chosen_redact_entities or chosen_redact_comprehend_entities:
1687
+ #print("Identifying redactions on page.")
1688
+
1689
+ page_analysed_bounding_boxes = run_page_text_redaction(
1690
+ language,
1691
+ chosen_redact_entities,
1692
+ chosen_redact_comprehend_entities,
1693
+ all_line_level_text_results_list,
1694
+ all_line_characters,
1695
+ page_analyser_results,
1696
+ page_analysed_bounding_boxes,
1697
+ comprehend_client,
1698
+ allow_list,
1699
+ pii_identification_method,
1700
+ nlp_analyser,
1701
+ score_threshold,
1702
+ custom_entities,
1703
+ comprehend_query_number
1704
+ )
1705
+
1706
+ #print("page_analyser_results:", page_analyser_results)
1707
+ #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1708
+ #print("image:", image)
1709
+ else:
1710
+ page_analysed_bounding_boxes = []
1711
+
1712
 
1713
  page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
1714
 
 
1717
  # Annotate redactions on page
1718
  pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)
1719
 
1720
+ # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
1721
 
1722
  # Make pymupdf page redactions
1723
  #print("redact_whole_page_list:", redact_whole_page_list)
tools/find_duplicate_pages.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import argparse
3
+ import glob
4
+ import os
5
+ import re
6
+ from tools.helper_functions import output_folder
7
+ from sklearn.feature_extraction.text import TfidfVectorizer
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+ import nltk
10
+ from nltk.corpus import stopwords
11
+ from nltk.tokenize import word_tokenize
12
+ from nltk.stem import PorterStemmer
13
+ import numpy as np
14
+ import random
15
+ import string
16
+ from typing import List
17
+
18
+ nltk.download('punkt')
19
+ nltk.download('stopwords')
20
+ nltk.download('punkt_tab')
21
+
22
+ similarity_threshold = 0.9
23
+
24
+ stop_words = set(stopwords.words('english'))
25
+ # List of words to remove from the stopword set
26
+ #words_to_remove = ['no', 'nor', 'not', 'don', 'don't', 'wasn', 'wasn't', 'weren', 'weren't', "don't", "wasn't", "weren't"]
27
+
28
+ # Remove the specified words from the stopwords set
29
+ #for word in words_to_remove:
30
+ # stop_words.discard(word.lower())
31
+
32
+ stemmer = PorterStemmer()
33
+ vectorizer = TfidfVectorizer()
34
+
35
def combine_ocr_output_text(input_files):
    """
    Combine text from multiple OCR-output CSV files, grouped by file and page.

    Each input CSV must contain 'page' and 'text' columns. Text is concatenated
    (space-separated) within each (file, page) group, and the combined result is
    written to a single CSV in `output_folder`.

    Args:
        input_files (str | list): A single CSV path, or a list of CSV paths /
            file objects exposing a `.name` attribute (e.g. Gradio uploads).

    Returns:
        tuple: (combined dataframe with columns [file, page, text],
                list containing the path of the combined output CSV)

    Raises:
        ValueError: If none of the inputs had the required columns.
    """
    all_data = []
    output_files = []

    # Accept either a single path or a list of paths / file objects
    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    for file in file_paths_list:

        # Gradio upload objects carry their path in .name
        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)

        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        # Guard against NaN / non-string cells: ' '.join would raise a
        # TypeError if any grouped value is a float (e.g. an empty OCR cell)
        df['text'] = df['text'].fillna('').astype(str)

        # Group by page and concatenate text
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()

        # Add filename column
        grouped['file'] = os.path.basename(file_path)

        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)

    output_files.append(output_combined_file_path)

    return combined_df, output_files
92
+
93
def process_data(df, column:str):
    '''
    Clean and stem the text in `column` of a dataframe, writing the result to
    a new 'text_clean' column.

    Cleaning strips HTML tags and entity references, lowercases and tokenises
    the text, drops punctuation/numbers and English stopwords, then
    Porter-stems each surviving token. The dataframe is modified in place and
    also returned.

    Args:
        df (pd.DataFrame): Dataframe containing the text column.
        column (str): Name of the column to clean.

    Returns:
        pd.DataFrame: The same dataframe with an added 'text_clean' column.
    '''

    def _clean_and_stem(raw_text):
        # Strip HTML tags first. (Explicit <strong>/</strong> replacements are
        # unnecessary: the generic tag pattern already removes them.)
        clean = re.sub(r'<.*?>', '', raw_text)
        # Entity references are not tags, so replace them separately
        clean = re.sub(r'&nbsp;', ' ', clean)
        clean = re.sub(r'\r\n', ' ', clean)
        clean = re.sub(r'&lt;', ' ', clean)
        clean = re.sub(r'&gt;', ' ', clean)

        # Replace non-breaking space \xa0 with a space
        clean = clean.replace(u'\xa0', u' ')
        # Collapse runs of whitespace
        clean = ' '.join(clean.split())

        # Tokenize, keeping only alphabetic, non-stopword tokens
        words = word_tokenize(clean.lower())
        words = [word for word in words if word.isalpha() and word not in stop_words]

        # Stem in the same pass — previously the cleaned string was joined and
        # re-tokenised just to stem, which produced the identical token list
        return ' '.join(stemmer.stem(word) for word in words)

    df['text_clean'] = df[column].apply(_clean_and_stem)

    return df
143
+
144
def identify_similar_pages(input_files:List[str]):
    '''
    Identify near-duplicate pages across one or more OCR output CSV files.

    Pages are compared by TF-IDF cosine similarity of their cleaned, stemmed
    text; pairs scoring above the module-level `similarity_threshold` are
    reported. Results are written to 'page_similarity_results.csv' in
    `output_folder`, plus one '<file>_whole_page.csv' per matched file listing
    the page numbers flagged for whole-page redaction.

    Args:
        input_files (List[str]): Paths to OCR output CSVs with 'page' and
            'text' columns (as produced elsewhere in this app).

    Returns:
        tuple: (dataframe of similar page pairs with similarity scores,
                list of all output file paths written)
    '''

    output_paths = []

    # Combine all inputs into one dataframe: one row per (file, page)
    df, output_files = combine_ocr_output_text(input_files)

    output_paths.extend(output_files)

    # Clean text
    df = process_data(df, 'text')

    # Vectorise text
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Find the indices of the most similar pages
    np.fill_diagonal(similarity_matrix, 0) # Ignore self-comparisons
    similar_pages = np.argwhere(similarity_matrix > similarity_threshold) # Threshold of similarity

    # Create a DataFrame for similar pairs and their scores.
    # NOTE: the *_File columns start out holding row indices; they are
    # overwritten with real values via positional .map below.
    similarity_df = pd.DataFrame({
        'Page1_Index': similar_pages[:, 0],
        'Page2_Index': similar_pages[:, 1],
        'Page1_File': similar_pages[:, 0],
        'Page2_File': similar_pages[:, 1],
        'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
    })

    # Filter out duplicate pairs (keep only one direction, i < j)
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    # Map the row indices to their corresponding file / page / text metadata
    # (relies on df having a default 0..n-1 RangeIndex, which combine_ocr_output_text
    # guarantees via ignore_index=True)
    similarity_df['Page1_File'] = similarity_df['Page1_File'].map(df['file'])
    similarity_df['Page2_File'] = similarity_df['Page2_File'].map(df['file'])

    similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
    similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])

    similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
    similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])

    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Save detailed results to a CSV file
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)

    output_paths.append(similarity_file_output_path)

    # For each file that appears on the Page2 side of a match, write the list
    # of its matched page numbers so the redaction step can redact whole pages.
    # NOTE(review): pages may appear more than once if they match several
    # other pages — duplicates are not removed here; confirm downstream
    # consumers tolerate that.
    if not similarity_df_out.empty:
        unique_files = similarity_df_out['Page2_File'].unique()
        for redact_file in unique_files:
            output_file_name = output_folder + redact_file + "_whole_page.csv"
            whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File']==redact_file,:][['Page2_Page']]
            whole_pages_to_redact_df.to_csv(output_file_name, header=None, index=None)

            output_paths.append(output_file_name)


    return similarity_df_out, output_paths
209
+
210
+ # Perturb text
211
+ # Apply the perturbation function with a 10% error probability
212
# Perturb text: introduce random typographical errors into a Series of strings.
def perturb_text_with_errors(series, error_probability: float = 0.1):
    """
    Introduce random typographical errors into every string of a pandas Series.

    Each word independently has probability *error_probability* of receiving one
    random perturbation: an inserted random letter, surrounding extra spaces, or
    an inserted punctuation character. Useful for generating noisy test data
    for fuzzy-matching experiments.

    Parameters:
    - series: pandas Series of strings to perturb.
    - error_probability: chance (0-1) that any given word is perturbed
      (default 0.1, matching the original hard-coded behaviour).

    Returns:
    - A new pandas Series with the perturbed text.
    """

    def _perturb_text(text, error_probability=0.1):
        words = text.split()  # words from split() are never empty strings
        perturbed_words = []

        for word in words:
            if random.random() < error_probability:  # add a random error
                perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])

                if perturbation_type == 'char_error':
                    # Insert a random lowercase letter at a random position
                    idx = random.randint(0, len(word) - 1)
                    char = random.choice(string.ascii_lowercase)
                    word = word[:idx] + char + word[idx:]

                elif perturbation_type == 'extra_space':
                    # Pad the word with spaces (kept by the final join, giving doubled spaces)
                    word = ' ' + word + ' '

                elif perturbation_type == 'extra_punctuation':
                    # Insert a random punctuation character at a random position
                    punctuation = random.choice(string.punctuation)
                    idx = random.randint(0, len(word))
                    word = word[:idx] + punctuation + word[idx:]

            perturbed_words.append(word)

        return ' '.join(perturbed_words)

    return series.apply(lambda x: _perturb_text(x, error_probability=error_probability))
242
+
243
+ # Run through command line
244
+ # def main():
245
+ # parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
246
+ # parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
247
+ # parser.add_argument('--output', '-o', default='combined_text.csv',
248
+ # help='Output CSV file path (default: combined_text.csv)')
249
+
250
+ # args = parser.parse_args()
251
+
252
+ # # Get list of input files
253
+ # input_files = glob.glob(args.input_pattern)
254
+
255
+ # if not input_files:
256
+ # print(f"No files found matching pattern: {args.input_pattern}")
257
+ # return
258
+
259
+ # print(f"Processing {len(input_files)} files...")
260
+
261
+ # try:
262
+ # # Combine the text from all files
263
+ # combined_df = combine_ocr_output_text(input_files)
264
+
265
+ # # Save to CSV
266
+ # combined_df.to_csv(args.output, index=False)
267
+ # print(f"Successfully created combined output: {args.output}")
268
+ # print(f"Total pages processed: {len(combined_df)}")
269
+
270
+ # except Exception as e:
271
+ # print(f"Error processing files: {str(e)}")
272
+
273
+ # if __name__ == "__main__":
274
+ # main()
tools/helper_functions.py CHANGED
@@ -4,23 +4,12 @@ import boto3
4
  from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
 
7
  import unicodedata
8
  from typing import List
9
  from gradio_image_annotation import image_annotator
10
  from tools.auth import user_pool_id
11
 
12
- def reset_state_vars():
13
- return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
14
- label="Modify redaction boxes",
15
- label_list=["Redaction"],
16
- label_colors=[(0, 0, 0)],
17
- show_label=False,
18
- sources=None,#["upload"],
19
- show_clear_button=False,
20
- show_share_button=False,
21
- show_remove_button=False,
22
- interactive=False
23
- ), [], []
24
 
25
  def get_or_create_env_var(var_name, default_value):
26
  # Get the environment variable if it exists
@@ -48,13 +37,40 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
48
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
49
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def load_in_default_allow_list(allow_list_file_path):
52
  if isinstance(allow_list_file_path, str):
53
  allow_list_file_path = [allow_list_file_path]
54
  return allow_list_file_path
55
 
56
 
57
- def get_file_path_end(file_path):
58
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
59
  basename = os.path.basename(file_path)
60
 
@@ -81,6 +97,8 @@ def detect_file_type(filename):
81
  return 'jpeg'
82
  elif filename.endswith('.png'):
83
  return 'png'
 
 
84
  else:
85
  raise ValueError("Unsupported file type.")
86
 
@@ -121,7 +139,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
121
  if regex_file_names:
122
  regex_file_name = regex_file_names[0]
123
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
124
- #regex_file_name_no_ext = get_file_path_end(regex_file_name)
125
 
126
  custom_regex.columns = custom_regex.columns.astype(str)
127
 
@@ -215,13 +233,41 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
215
  except Exception as e:
216
  print("Could not remove usage logs file", e)
217
 
218
- # Retrieving or setting CUSTOM_HEADER
219
- CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
220
- print(f'CUSTOM_HEADER found')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
221
 
222
- # Retrieving or setting CUSTOM_HEADER_VALUE
223
- CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
224
- print(f'CUSTOM_HEADER_VALUE found')
225
 
226
  async def get_connection_params(request: gr.Request):
227
  base_folder = ""
 
4
  from botocore.exceptions import ClientError
5
  import gradio as gr
6
  import pandas as pd
7
+ import numpy as np
8
  import unicodedata
9
  from typing import List
10
  from gradio_image_annotation import image_annotator
11
  from tools.auth import user_pool_id
12
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def get_or_create_env_var(var_name, default_value):
15
  # Get the environment variable if it exists
 
37
  input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
38
  print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
39
 
40
+ # Retrieving or setting CUSTOM_HEADER
41
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
42
+ print(f'CUSTOM_HEADER found')
43
+
44
+ # Retrieving or setting CUSTOM_HEADER_VALUE
45
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
46
+ print(f'CUSTOM_HEADER_VALUE found')
47
+
48
+
49
def reset_state_vars():
    """Return fresh default values for all app-level state components."""
    # A blank, non-interactive annotator replaces any previous redaction display.
    blank_annotator = image_annotator(
        label="Modify redaction boxes",
        label_list=["Redaction"],
        label_colors=[(0, 0, 0)],
        show_label=False,
        sources=None,  # ["upload"]
        show_clear_button=False,
        show_share_button=False,
        show_remove_button=False,
        interactive=False
    )
    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", blank_annotator, [], [], [], pd.DataFrame(), pd.DataFrame()
61
+
62
def reset_review_vars():
    """Return empty defaults for the review tab: a file list plus two empty DataFrames."""
    empty_frame = pd.DataFrame()
    # Return two distinct DataFrame objects so downstream mutation cannot alias
    return [], empty_frame, empty_frame.copy()
64
+
65
+
66
+
67
def load_in_default_allow_list(allow_list_file_path):
    """Normalise an allow-list path: wrap a bare string in a one-item list, pass anything else through."""
    return [allow_list_file_path] if isinstance(allow_list_file_path, str) else allow_list_file_path
71
 
72
 
73
+ def get_file_name_without_type(file_path):
74
  # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
75
  basename = os.path.basename(file_path)
76
 
 
97
  return 'jpeg'
98
  elif filename.endswith('.png'):
99
  return 'png'
100
+ elif filename.endswith('.xfdf'):
101
+ return 'xfdf'
102
  else:
103
  raise ValueError("Unsupported file type.")
104
 
 
139
  if regex_file_names:
140
  regex_file_name = regex_file_names[0]
141
  custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
142
+ #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)
143
 
144
  custom_regex.columns = custom_regex.columns.astype(str)
145
 
 
233
  except Exception as e:
234
  print("Could not remove usage logs file", e)
235
 
236
def merge_csv_files(file_list):
    """
    Merge multiple redaction review CSV files into a single de-duplicated CSV.

    Bounding-box coordinates are floored so that boxes differing only by
    sub-pixel amounts collapse onto one row, then exact duplicates are dropped.

    Parameters:
    - file_list: list of uploaded files (objects with a `.name` path attribute)
      or plain path strings. The original mixed `file.name` for reading with
      `file_list[0]` for naming; both forms are now accepted consistently.

    Returns:
    - List containing the path of the merged CSV written to the output folder.
    """
    output_files = []

    # Accept both gradio file objects (with .name) and plain string paths
    file_paths = [f if isinstance(f, str) else f.name for f in file_list]

    # Read every CSV and concatenate into a single DataFrame
    dataframes = [pd.read_csv(path) for path in file_paths]
    merged_df = pd.concat(dataframes, ignore_index=True)

    # Floor coordinates so near-identical boxes de-duplicate
    for col in ['xmin', 'xmax', 'ymin', 'ymax']:
        merged_df[col] = np.floor(merged_df[col])

    merged_df = merged_df.drop_duplicates(subset=['page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax'])

    merged_df = merged_df.sort_values(['page', 'ymin', 'xmin', 'label'])

    # Name the output after the first input file
    file_out_name = os.path.basename(file_paths[0])

    merged_csv_path = output_folder + file_out_name + "_merged.csv"

    # Save the merged DataFrame to a CSV file
    merged_df.to_csv(merged_csv_path, index=False)
    output_files.append(merged_csv_path)

    return output_files
269
+
270
 
 
 
 
271
 
272
  async def get_connection_params(request: gr.Request):
273
  base_folder = ""
tools/load_spacy_model_custom_recognisers.py CHANGED
@@ -3,9 +3,13 @@ from typing import List
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
 
 
6
  spacy.prefer_gpu()
7
  from spacy.cli.download import download
 
8
  import re
 
9
 
10
  model_name = "en_core_web_sm" #"en_core_web_trf"
11
  score_threshold = 0.001
@@ -65,16 +69,8 @@ ukpostcode_pattern = Pattern(
65
  # Define the recognizer with one or more patterns
66
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
67
 
68
- # %%
69
- # Examples for testing
70
-
71
- #text = "I live in 510 Broad st SE5 9NG ."
72
-
73
- #numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
74
- #print("Result:")
75
- #print(numbers_result)
76
 
77
- # %%
78
  def extract_street_name(text:str) -> str:
79
  """
80
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
@@ -101,7 +97,7 @@ def extract_street_name(text:str) -> str:
101
  pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
102
 
103
  # Find all matches in text
104
- matches = re.finditer(pattern, text, re.IGNORECASE)
105
 
106
  start_positions = []
107
  end_positions = []
@@ -120,19 +116,6 @@ def extract_street_name(text:str) -> str:
120
 
121
  return start_positions, end_positions
122
 
123
-
124
- # %%
125
- # Some examples for testing
126
-
127
- #text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
128
- #text = "Roberto lives in Five 10 Broad st in Oregon"
129
- #text = "Roberto lives in 55 Oregon Square"
130
- #text = "There is 51a no way I will do that"
131
- #text = "I am writing to apply for"
132
-
133
- #extract_street_name(text)
134
-
135
- # %%
136
  class StreetNameRecognizer(EntityRecognizer):
137
 
138
  def load(self) -> None:
@@ -163,14 +146,181 @@ class StreetNameRecognizer(EntityRecognizer):
163
 
164
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  # Create a class inheriting from SpacyNlpEngine
167
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
168
  def __init__(self, loaded_spacy_model):
169
  super().__init__()
170
  self.nlp = {"en": loaded_spacy_model}
171
 
172
-
173
-
174
  # Pass the loaded model to the new LoadedSpacyNlpEngine
175
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
176
 
@@ -186,4 +336,5 @@ nlp_analyser.registry.add_recognizer(street_recogniser)
186
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
187
  nlp_analyser.registry.add_recognizer(titles_recogniser)
188
  nlp_analyser.registry.add_recognizer(custom_recogniser)
 
189
 
 
3
  from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
4
  from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
5
  import spacy
6
+ from spacy.matcher import Matcher, PhraseMatcher
7
+ from spaczz.matcher import FuzzyMatcher
8
  spacy.prefer_gpu()
9
  from spacy.cli.download import download
10
+ import Levenshtein
11
  import re
12
+ import gradio as gr
13
 
14
  model_name = "en_core_web_sm" #"en_core_web_trf"
15
  score_threshold = 0.001
 
69
  # Define the recognizer with one or more patterns
70
  ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])
71
 
72
+ ### Street name
 
 
 
 
 
 
 
73
 
 
74
  def extract_street_name(text:str) -> str:
75
  """
76
  Extracts the street name and preceding word (that should contain at least one number) from the given text.
 
97
  pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'
98
 
99
  # Find all matches in text
100
+ matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
101
 
102
  start_positions = []
103
  end_positions = []
 
116
 
117
  return start_positions, end_positions
118
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  class StreetNameRecognizer(EntityRecognizer):
120
 
121
  def load(self) -> None:
 
146
 
147
  street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])
148
 
149
+ ## Custom fuzzy match recogniser for list of strings
150
## Custom fuzzy match recogniser for list of strings
def custom_fuzzy_word_list_regex(text: str, custom_list: List[str] = []):
    """
    Find whole-word, case-insensitive occurrences of each term in *custom_list* within *text*.

    Each term is matched as a whole word (no word character on either side).
    Double quotes inside a term are matched against several quote variants.

    Parameters:
    - text: the text to search.
    - custom_list: list of terms to find.

    Returns:
    - (start_positions, end_positions): parallel lists of character offsets for every match.
    """
    # An empty term list must match nothing. Without this guard the joined
    # pattern is the empty string, which matches (zero-width) at every position.
    if not custom_list:
        return [], []

    # Create regex pattern, handling quotes carefully
    quote_str = '"'
    replace_str = '(?:"|"|")'

    custom_regex_pattern = '|'.join(
        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
        for term in custom_list
    )

    # Find all matches in text
    start_positions = []
    end_positions = []
    for match in re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE):
        start_positions.append(match.start())
        end_positions.append(match.end())

    return start_positions, end_positions
175
+
176
def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
    '''
    Conduct a fuzzy match of a list of query strings against a block of text.

    Parameters:
    - text: the text to search within.
    - custom_query_list: list of query strings to look for.
    - spelling_mistakes_max: maximum number of edit-distance mistakes tolerated per match.
    - search_whole_phrase: if True, match each query as a whole phrase with spaczz's
      FuzzyMatcher and then filter by Levenshtein distance; if False, match individual
      non-stopword tokens with spaCy's Matcher using FUZZY patterns.
    - nlp: the loaded spaCy language model.
    - progress: Gradio progress tracker (not referenced in the loop body).

    Returns:
    - (all_start_positions, all_end_positions): parallel lists of character offsets
      for every accepted match.
      NOTE(review): when *text* is falsy this instead returns (message, None) — a
      different shape from the normal path; confirm callers handle both.
    '''

    all_matches = []
    all_start_positions = []
    all_end_positions = []
    all_ratios = []
    # NOTE(review): all_matches and all_ratios are accumulated but never returned.

    #print("custom_query_list:", custom_query_list)

    if not text:
        out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
        print(out_message)
        return out_message, None

    for string_query in custom_query_list:

        #print("text:", text)
        #print("string_query:", string_query)

        query = nlp(string_query)

        if search_whole_phrase == False:
            # Token mode: keep only words that are not stop words / punctuation / whitespace
            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]

            # spaCy pattern key, e.g. "FUZZY1" tolerates one edit per token
            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)

            #print("token_query:", token_query)

            if len(token_query) > 1:
                #pattern_lemma = [{"LEMMA": {"IN": query}}]
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
            else:
                #pattern_lemma = [{"LEMMA": query[0]}]
                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]

            matcher = Matcher(nlp.vocab)
            matcher.add(string_query, [pattern_fuzz])
            #matcher.add(string_query, [pattern_lemma])

        else:
            # Whole-phrase mode: use spaczz FuzzyMatcher, then filter candidates by
            # Levenshtein distance below.
            #tokenised_query = [string_query.lower()]
            matcher = FuzzyMatcher(nlp.vocab)
            patterns = [nlp.make_doc(string_query)]  # Convert query into a Doc object
            matcher.add("PHRASE", patterns, [{"ignore_case": True}])

        batch_size = 256
        # Only one document is piped; batch_size is kept for parity with batch usage elsewhere
        docs = nlp.pipe([text], batch_size=batch_size)

        # Get number of matches per doc
        for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
            matches = matcher(doc)
            match_count = len(matches)

            # If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
            if search_whole_phrase==False:
                all_matches.append(match_count)

                for match_id, start, end in matches:
                    span = str(doc[start:end]).strip()
                    query_search = str(query).strip()
                    #print("doc:", doc)
                    #print("span:", span)
                    #print("query_search:", query_search)

                    # Convert word positions to character positions
                    start_char = doc[start].idx  # Start character position
                    end_char = doc[end - 1].idx + len(doc[end - 1])  # End character position

                    # The positions appended below are character positions
                    all_matches.append(match_count)
                    all_start_positions.append(start_char)
                    all_end_positions.append(end_char)

            else:
                # FuzzyMatcher yields (id, start, end, ratio, pattern) tuples
                for match_id, start, end, ratio, pattern in matches:
                    span = str(doc[start:end]).strip()
                    query_search = str(query).strip()
                    print("doc:", doc)
                    print("span:", span)
                    print("query_search:", query_search)

                    # Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
                    distance = Levenshtein.distance(query_search.lower(), span.lower())

                    print("Levenshtein distance:", distance)

                    if distance > spelling_mistakes_max:
                        # Too many edits: reject this candidate match
                        match_count = match_count - 1
                    else:
                        # Convert word positions to character positions
                        start_char = doc[start].idx  # Start character position
                        end_char = doc[end - 1].idx + len(doc[end - 1])  # End character position

                        print("start_char:", start_char)
                        print("end_char:", end_char)

                        all_matches.append(match_count)
                        all_start_positions.append(start_char)
                        all_end_positions.append(end_char)
                        all_ratios.append(ratio)


    return all_start_positions, all_end_positions
283
+
284
+
285
class CustomWordFuzzyRecognizer(EntityRecognizer):
    """Presidio recognizer that flags fuzzy matches against a user-supplied word list."""

    def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
        super().__init__(supported_entities=supported_entities)
        # Search configuration consumed by analyze() on every call
        self.custom_list = custom_list
        self.spelling_mistakes_max = spelling_mistakes_max
        self.search_whole_phrase = search_whole_phrase

    def load(self) -> None:
        """No model loading is required for this recognizer."""
        pass

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
        """Return a CUSTOM_FUZZY RecognizerResult for every fuzzy match found in *text*."""
        start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase)

        return [
            RecognizerResult(entity_type="CUSTOM_FUZZY", start=start, end=end, score=1)
            for start, end in zip(start_pos, end_pos)
        ]
314
+
315
+ custom_list_default = []
316
+ custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
317
+
318
  # Create a class inheriting from SpacyNlpEngine
319
  class LoadedSpacyNlpEngine(SpacyNlpEngine):
320
  def __init__(self, loaded_spacy_model):
321
  super().__init__()
322
  self.nlp = {"en": loaded_spacy_model}
323
 
 
 
324
  # Pass the loaded model to the new LoadedSpacyNlpEngine
325
  loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
326
 
 
336
  nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
337
  nlp_analyser.registry.add_recognizer(titles_recogniser)
338
  nlp_analyser.registry.add_recognizer(custom_recogniser)
339
+ nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
340
 
tools/redaction_review.py CHANGED
@@ -1,12 +1,14 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
 
 
 
4
  from typing import List
5
  from gradio_image_annotation import image_annotator
6
  from gradio_image_annotation.image_annotator import AnnotatedImageData
7
-
8
- from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
9
- from tools.helper_functions import get_file_path_end, output_folder
10
  from tools.file_redaction import redact_page_with_pymupdf
11
  import json
12
  import os
@@ -66,6 +68,12 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
66
  for image, items in image_groups.items():
67
  # Filter items with non-empty boxes
68
  non_empty_boxes = [item for item in items if item.get('boxes')]
 
 
 
 
 
 
69
  if non_empty_boxes:
70
  # Keep the first entry with non-empty boxes
71
  result.append(non_empty_boxes[0])
@@ -173,6 +181,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
173
 
174
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
175
 
 
 
176
  out_image_annotator = image_annotator(
177
  value = image_annotator_object[page_num_reported - 1],
178
  boxes_alpha=0.1,
@@ -262,7 +272,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
262
 
263
  for file_path in file_paths:
264
  #print("file_path:", file_path)
265
- file_name_without_ext = get_file_path_end(file_path)
266
  file_name_with_ext = os.path.basename(file_path)
267
 
268
  file_extension = os.path.splitext(file_path)[1].lower()
@@ -381,3 +391,365 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
381
  row_value_page = evt.row_value[0] # This is the page number value
382
  return row_value_page
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
+ from xml.etree.ElementTree import Element, SubElement, tostring, parse
5
+ from xml.dom import minidom
6
+ import uuid
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
+ from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
11
+ from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
 
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
 
68
  for image, items in image_groups.items():
69
  # Filter items with non-empty boxes
70
  non_empty_boxes = [item for item in items if item.get('boxes')]
71
+
72
+ # Remove 'text' elements from boxes
73
+ for item in non_empty_boxes:
74
+ if 'boxes' in item:
75
+ item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
76
+
77
  if non_empty_boxes:
78
  # Keep the first entry with non-empty boxes
79
  result.append(non_empty_boxes[0])
 
181
 
182
  image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
183
 
184
+
185
+
186
  out_image_annotator = image_annotator(
187
  value = image_annotator_object[page_num_reported - 1],
188
  boxes_alpha=0.1,
 
272
 
273
  for file_path in file_paths:
274
  #print("file_path:", file_path)
275
+ file_name_without_ext = get_file_name_without_type(file_path)
276
  file_name_with_ext = os.path.basename(file_path)
277
 
278
  file_extension = os.path.splitext(file_path)[1].lower()
 
391
  row_value_page = evt.row_value[0] # This is the page number value
392
  return row_value_page
393
 
394
def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
    '''
    Map a bounding box from image pixel space into Adobe PDF coordinate space.

    Parameters:
    - pdf_page_width / pdf_page_height: dimensions of the PDF page
    - image_width / image_height: dimensions of the source image
    - x1, y1, x2, y2: box coordinates in image space

    Returns:
    - (x1, y1, x2, y2) in Adobe PDF space, with y1 <= y2.
    '''
    # Scale factors from image pixels to PDF points
    x_scale = pdf_page_width / image_width
    y_scale = pdf_page_height / image_height

    pdf_x1 = x1 * x_scale
    pdf_x2 = x2 * x_scale

    # Adobe's origin is bottom-left, so the vertical axis is flipped
    flipped_y1 = pdf_page_height - y1 * y_scale
    flipped_y2 = pdf_page_height - y2 * y_scale

    # Adobe's coordinate system expects the smaller y first
    pdf_y1 = min(flipped_y1, flipped_y2)
    pdf_y2 = max(flipped_y1, flipped_y2)

    return pdf_x1, pdf_y1, pdf_x2, pdf_y2
427
+
428
+
429
def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
    '''
    Create an xfdf (Adobe annotation exchange) document from a review DataFrame and a pdf.

    Parameters:
    - df: review DataFrame; rows are accessed with columns 'page', 'xmin', 'ymin',
      'xmax', 'ymax', 'color', 'label' and 'text'.
    - pdf_path: path string written into the xfdf header element.
    - pymupdf_doc: an open PyMuPDF document used to read per-page dimensions.
    - image_paths: per-page images (path strings or objects with a .size attribute),
      indexed by 0-based page number.

    Returns:
    - Pretty-printed xfdf XML as a string.
    '''

    # Create root element
    xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")

    # Add header
    header = SubElement(xfdf, 'header')
    header.set('pdf-filepath', pdf_path)

    # Add annots
    annots = SubElement(xfdf, 'annots')

    # One <redact> annotation per review row
    for _, row in df.iterrows():
        # 0-based page index for PyMuPDF
        page_python_format = int(row["page"])-1

        pymupdf_page = pymupdf_doc.load_page(page_python_format)

        pdf_page_height = pymupdf_page.rect.height
        pdf_page_width = pymupdf_page.rect.width

        image = image_paths[page_python_format]

        #print("image:", image)

        if isinstance(image, str):
            # NOTE(review): `Image` (PIL) is not among this file's visible imports —
            # confirm `from PIL import Image` exists elsewhere in the module.
            image = Image.open(image)

        # Image dimensions drive the image->PDF coordinate conversion below
        image_page_width, image_page_height = image.size

        # Create redaction annotation
        redact_annot = SubElement(annots, 'redact')

        # Generate unique ID
        annot_id = str(uuid.uuid4())
        redact_annot.set('name', annot_id)

        # Set page number (subtract 1 as PDF pages are 0-based)
        redact_annot.set('page', str(int(row['page']) - 1))

        # Convert box coordinates from image space to Adobe PDF space
        x1, y1, x2, y2 = convert_image_coords_to_adobe(
            pdf_page_width,
            pdf_page_height,
            image_page_width,
            image_page_height,
            row['xmin'],
            row['ymin'],
            row['xmax'],
            row['ymax']
        )

        if CUSTOM_BOX_COLOUR == "grey":
            colour_str = "0.5,0.5,0.5"
        else:
            # Turn a "(r, g, b)"-style string into Adobe's comma-separated form
            colour_str = row['color'].strip('()').replace(' ', '')

        # Set coordinates
        redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}")

        # Set redaction properties
        redact_annot.set('title', row['label']) # The type of redaction (e.g., "PERSON")
        redact_annot.set('contents', row['text']) # The redacted text
        redact_annot.set('subject', row['label']) # The redacted text
        redact_annot.set('mimetype', "Form")

        # Set appearance properties
        redact_annot.set('border-color', colour_str) # Black border
        redact_annot.set('repeat', 'false')
        redact_annot.set('interior-color', colour_str)
        #redact_annot.set('fill-color', colour_str)
        #redact_annot.set('outline-color', colour_str)
        redact_annot.set('overlay-color', colour_str)
        redact_annot.set('overlay-text', row['label'])
        redact_annot.set('opacity', "0.5")

        # Add appearance dictionary
        # appearanceDict = SubElement(redact_annot, 'appearancedict')

        # # Normal appearance
        # normal = SubElement(appearanceDict, 'normal')
        # #normal.set('appearance', 'redact')

        # # Color settings for the mark (before applying redaction)
        # markAppearance = SubElement(redact_annot, 'markappearance')
        # markAppearance.set('stroke-color', colour_str) # Red outline
        # markAppearance.set('fill-color', colour_str) # Light red fill
        # markAppearance.set('opacity', '0.5') # 50% opacity

        # # Final redaction appearance (after applying)
        # redactAppearance = SubElement(redact_annot, 'redactAppearance')
        # redactAppearance.set('fillColor', colour_str) # Black fill
        # redactAppearance.set('fontName', 'Helvetica')
        # redactAppearance.set('fontSize', '12')
        # redactAppearance.set('textAlignment', 'left')
        # redactAppearance.set('textColor', colour_str) # White text

    # Convert to pretty XML string
    xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")

    return xml_str
532
+
533
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
    '''
    Convert uploaded review CSV file(s) into Adobe xfdf comment files, writing one
    xfdf per CSV and returning the output paths.
    '''
    output_paths = []
    pdf_name = ""

    # Normalise to a list of inputs
    file_paths_list = [input_files] if isinstance(input_files, str) else input_files

    # Process pdfs before csvs so the pdf name is known when a csv is handled
    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

    for file in file_paths_list:
        # Accept plain paths or uploaded-file objects
        file_path = file if isinstance(file, str) else file.name

        file_path_name = get_file_name_without_type(file_path)
        file_path_end = detect_file_type(file_path)

        if file_path_end == "pdf":
            pdf_name = os.path.basename(file_path)

        if file_path_end == "csv":
            # If no pdf name, just get the name of the file path
            if not pdf_name:
                pdf_name = file_path_name

            # Read the review CSV, blanking out missing values
            review_df = pd.read_csv(file_path)
            review_df.fillna('', inplace=True)

            xfdf_content = create_xfdf(review_df, pdf_name, pdf_doc, image_paths)

            output_path = output_folder + file_path_name + "_adobe.xfdf"
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(xfdf_content)

            output_paths.append(output_path)

    return output_paths
580
+
581
+
582
+ ### Convert xfdf coordinates back to image for app
583
+
584
def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
    '''
    Map a rectangle from Adobe PDF coordinate space (origin at the
    bottom-left of the page) into image coordinate space (origin top-left).

    Parameters:
    - pdf_page_width: Width of the PDF page
    - pdf_page_height: Height of the PDF page
    - image_width: Width of the source image
    - image_height: Height of the source image
    - x1, y1, x2, y2: Rectangle coordinates in Adobe PDF space

    Returns:
    - Tuple (x1, y1, x2, y2) of converted coordinates in image space
    '''
    # Per-axis scale factors from PDF units to image pixels
    x_scale = image_width / pdf_page_width
    y_scale = image_height / pdf_page_height

    img_x1 = x1 * x_scale
    img_x2 = x2 * x_scale

    # Flip the vertical axis: Adobe's y grows upward from the bottom-left,
    # while the image's y grows downward from the top-left
    img_y1 = (pdf_page_height - y1) * y_scale
    img_y2 = (pdf_page_height - y2) * y_scale

    # Image coordinates expect the smaller y first
    if img_y2 < img_y1:
        img_y1, img_y2 = img_y2, img_y1

    return img_x1, img_y1, img_x2, img_y2
617
+
618
def parse_xfdf(xfdf_path):
    '''
    Parse an XFDF file and extract its redaction annotations.

    Parameters:
    - xfdf_path: Path to the XFDF file

    Returns:
    - List of dictionaries, one per redaction, containing the page number
      (1-based), bounding box, label, text and colour information.
    '''
    tree = parse(xfdf_path)
    root = tree.getroot()

    # Adobe XFDF elements live in this default namespace
    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}

    redactions = []

    # Find all redact elements using the namespace
    for redact in root.findall('.//xfdf:redact', namespaces=namespace):
        # rect is "xmin,ymin,xmax,ymax" — split it once instead of once per field
        xmin, ymin, xmax, ymax = (float(coord) for coord in redact.get('rect').split(','))

        redactions.append({
            'image': '',  # Image path is filled in later by the caller
            'page': int(redact.get('page')) + 1,  # Convert to 1-based index
            'xmin': xmin,
            'ymin': ymin,
            'xmax': xmax,
            'ymax': ymax,
            'label': redact.get('title'),
            'text': redact.get('contents'),
            'color': redact.get('border-color', '(0, 0, 0)')  # Default to black if not specified
        })

    return redactions
657
+
658
def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
    '''
    Convert redaction annotations from an Adobe XFDF file into a review-file
    CSV, rescaling Adobe PDF coordinates into image coordinates.

    Parameters:
    - file_paths_list: List of input file paths; must include the .xfdf file
      and the original .pdf it annotates
    - pymupdf_doc: PyMuPDF document object for the original PDF
    - image_paths: Per-page image file paths (or PIL Images — TODO confirm
      callers only pass paths) corresponding to the PDF pages

    Returns:
    - List of output paths: the original PDF plus the generated review CSV

    Raises:
    - ValueError: if an .xfdf file is supplied without its original PDF
    '''
    output_paths = []
    # Must be initialised before the loop: it is checked when an .xfdf is
    # seen, and previously raised NameError (not the intended ValueError)
    # when no PDF preceded the .xfdf in the list.
    pdf_name = ""
    df = pd.DataFrame()

    # Sort the file paths so that the pdfs come first
    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))

    for file in file_paths_list:

        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        file_path_name = get_file_name_without_type(file_path)
        file_path_end = detect_file_type(file_path)

        if file_path_end == "pdf":
            pdf_name = os.path.basename(file_path)

            # Pass the original PDF through to the outputs
            output_paths.append(file_path)

        if file_path_end == "xfdf":

            if not pdf_name:
                message = "Original PDF needed to convert from .xfdf format"
                print(message)
                raise ValueError(message)

            # Use the resolved string path rather than the raw file object
            xfdf_path = file_path

            file_path_name = get_file_name_without_type(xfdf_path)

            # Parse the XFDF file into a list of redaction dicts
            redactions = parse_xfdf(xfdf_path)

            # Create a DataFrame from the redaction information
            df = pd.DataFrame(redactions)

            df.fillna('', inplace=True)  # Replace NaN with an empty string

            for idx, row in df.iterrows():
                page_python_format = int(row["page"]) - 1

                pymupdf_page = pymupdf_doc.load_page(page_python_format)

                pdf_page_height = pymupdf_page.rect.height
                pdf_page_width = pymupdf_page.rect.width

                image_path = image_paths[page_python_format]

                if isinstance(image_path, str):
                    image = Image.open(image_path)
                else:
                    # Assume a PIL Image was passed directly — previously this
                    # case read stale/undefined dimensions
                    image = image_path

                image_page_width, image_page_height = image.size

                # Convert Adobe PDF coordinates to image coordinates
                image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(
                    pdf_page_width, pdf_page_height, image_page_width, image_page_height,
                    row['xmin'], row['ymin'], row['xmax'], row['ymax'])

                df.loc[idx, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]

                # Record which page image this redaction belongs to
                df.loc[idx, 'image'] = image_path

            out_file_path = output_folder + file_path_name + "_review_file.csv"
            df.to_csv(out_file_path, index=None)

            output_paths.append(out_file_path)

    return output_paths