Merge pull request #6 from seanpedrick-case/dev
Browse filesExport to Adobe, fuzzy matching, and duplicate page identification
- Dockerfile +3 -0
- app.py +85 -25
- requirements.txt +4 -0
- tools/custom_image_analyser_engine.py +42 -11
- tools/data_anonymise.py +3 -3
- tools/file_conversion.py +10 -6
- tools/file_redaction.py +99 -236
- tools/find_duplicate_pages.py +274 -0
- tools/helper_functions.py +66 -20
- tools/load_spacy_model_custom_recognisers.py +176 -25
- tools/redaction_review.py +376 -4
Dockerfile
CHANGED
@@ -60,6 +60,9 @@ RUN mkdir -p /home/user/app/output \
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
|
|
|
|
|
|
63 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
64 |
COPY entrypoint.sh /entrypoint.sh
|
65 |
|
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
63 |
+
# Download NLTK data packages
|
64 |
+
RUN python -m nltk.downloader punkt stopwords punkt_tab
|
65 |
+
|
66 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
67 |
COPY entrypoint.sh /entrypoint.sh
|
68 |
|
app.py
CHANGED
@@ -10,15 +10,16 @@ from datetime import datetime
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
-
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
17 |
-
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
|
|
22 |
|
23 |
today_rev = datetime.now().strftime("%Y%m%d")
|
24 |
|
@@ -29,15 +30,16 @@ ensure_output_folder_exists()
|
|
29 |
|
30 |
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
31 |
|
32 |
-
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER']
|
33 |
|
34 |
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
|
35 |
chosen_comprehend_entities.extend(custom_entities)
|
36 |
full_comprehend_entity_list.extend(custom_entities)
|
37 |
|
|
|
38 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
|
39 |
|
40 |
-
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM']
|
41 |
|
42 |
language = 'en'
|
43 |
|
@@ -67,10 +69,9 @@ with app:
|
|
67 |
pdf_doc_state = gr.State([])
|
68 |
all_image_annotations_state = gr.State([])
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
74 |
|
75 |
session_hash_state = gr.State()
|
76 |
s3_output_folder_state = gr.State()
|
@@ -129,16 +130,16 @@ with app:
|
|
129 |
## Settings page variables
|
130 |
default_allow_list_file_name = "default_allow_list.csv"
|
131 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
132 |
-
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
|
133 |
|
134 |
default_deny_list_file_name = "default_deny_list.csv"
|
135 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
136 |
-
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
|
137 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
138 |
|
139 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
140 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
141 |
-
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
|
142 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
143 |
|
144 |
# S3 settings for default allow list load
|
@@ -149,6 +150,12 @@ with app:
|
|
149 |
# Base dataframe for recognisers that is not modified subsequent to load
|
150 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
151 |
|
|
|
|
|
|
|
|
|
|
|
|
|
152 |
###
|
153 |
# UI DESIGN
|
154 |
###
|
@@ -164,8 +171,10 @@ with app:
|
|
164 |
|
165 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
166 |
|
167 |
-
|
168 |
-
|
|
|
|
|
169 |
with gr.Accordion("Redact document", open = True):
|
170 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
171 |
if RUN_AWS_FUNCTIONS == "1":
|
@@ -194,7 +203,9 @@ with app:
|
|
194 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
195 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
196 |
|
197 |
-
|
|
|
|
|
198 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
199 |
|
200 |
with gr.Accordion(label = "Review redaction file", open=True):
|
@@ -215,7 +226,6 @@ with app:
|
|
215 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
216 |
|
217 |
with gr.Row():
|
218 |
-
|
219 |
with gr.Column(scale=1):
|
220 |
|
221 |
zoom_str = str(annotator_zoom_number) + '%'
|
@@ -247,10 +257,16 @@ with app:
|
|
247 |
#with gr.Column(scale=1):
|
248 |
with gr.Row():
|
249 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
250 |
-
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
251 |
|
252 |
-
|
|
|
|
|
|
|
|
|
|
|
253 |
# TEXT / TABULAR DATA TAB
|
|
|
254 |
with gr.Tab(label="Open text or Excel/csv files"):
|
255 |
gr.Markdown(
|
256 |
"""
|
@@ -280,7 +296,20 @@ with app:
|
|
280 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
281 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
# SETTINGS TAB
|
|
|
284 |
with gr.Tab(label="Redaction settings"):
|
285 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
286 |
with gr.Row():
|
@@ -296,9 +325,12 @@ with app:
|
|
296 |
|
297 |
with gr.Accordion("Select entity types to redact", open = True):
|
298 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
|
299 |
-
|
300 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
|
301 |
|
|
|
|
|
|
|
|
|
302 |
with gr.Accordion("Redact only selected pages", open = False):
|
303 |
with gr.Row():
|
304 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
@@ -312,21 +344,30 @@ with app:
|
|
312 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
|
313 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
314 |
|
315 |
-
log_files_output = gr.File(label="Log file output", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
316 |
|
317 |
###
|
318 |
# PDF/IMAGE REDACTION
|
319 |
###
|
320 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
321 |
|
322 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
|
323 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
324 |
-
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
325 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
326 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
327 |
|
328 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
329 |
-
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
330 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
|
331 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
332 |
|
@@ -339,7 +380,8 @@ with app:
|
|
339 |
###
|
340 |
|
341 |
# Upload previous files for modifying redactions
|
342 |
-
upload_previous_review_file_btn.click(fn=
|
|
|
343 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
344 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
345 |
|
@@ -397,7 +439,16 @@ with app:
|
|
397 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
398 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
399 |
|
400 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
401 |
###
|
402 |
# TABULAR DATA REDACTION
|
403 |
###
|
@@ -410,13 +461,22 @@ with app:
|
|
410 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
411 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
412 |
|
|
|
|
|
|
|
|
|
|
|
413 |
###
|
414 |
# SETTINGS PAGE INPUT / OUTPUT
|
415 |
###
|
416 |
-
# If a custom allow list is uploaded
|
417 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
418 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
419 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
|
|
|
|
|
|
|
|
420 |
|
421 |
|
422 |
###
|
|
|
10 |
from gradio_image_annotation import image_annotator
|
11 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
12 |
|
13 |
+
from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files
|
14 |
from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
|
15 |
from tools.file_redaction import choose_and_run_redactor
|
16 |
from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
|
17 |
+
from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
|
18 |
from tools.data_anonymise import anonymise_data_files
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
22 |
+
from tools.find_duplicate_pages import identify_similar_pages
|
23 |
|
24 |
today_rev = datetime.now().strftime("%Y%m%d")
|
25 |
|
|
|
30 |
|
31 |
chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD', 'IP_ADDRESS','MAC_ADDRESS', 'LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER', 'INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER']
|
32 |
|
33 |
+
full_comprehend_entity_list = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER','CREDIT_DEBIT_CVV','CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS','NAME','PHONE','SSN','DATE_TIME','PASSPORT_NUMBER','DRIVER_ID','URL','AGE','USERNAME','PASSWORD','AWS_ACCESS_KEY','AWS_SECRET_KEY','IP_ADDRESS','MAC_ADDRESS','ALL','LICENSE_PLATE','VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER','CA_SOCIAL_INSURANCE_NUMBER','US_INDIVIDUAL_TAX_IDENTIFICATION_NUMBER','UK_UNIQUE_TAXPAYER_REFERENCE_NUMBER','IN_PERMANENT_ACCOUNT_NUMBER','IN_NREGA','INTERNATIONAL_BANK_ACCOUNT_NUMBER','SWIFT_CODE','UK_NATIONAL_HEALTH_SERVICE_NUMBER','CA_HEALTH_NUMBER','IN_AADHAAR','IN_VOTER_NUMBER', "CUSTOM_FUZZY"]
|
34 |
|
35 |
# Add custom spacy recognisers to the Comprehend list, so that local Spacy model can be used to pick up e.g. titles, streetnames, UK postcodes that are sometimes missed by comprehend
|
36 |
chosen_comprehend_entities.extend(custom_entities)
|
37 |
full_comprehend_entity_list.extend(custom_entities)
|
38 |
|
39 |
+
# Entities for local PII redaction option
|
40 |
chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", "CUSTOM"]
|
41 |
|
42 |
+
full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS', 'CUSTOM', 'CUSTOM_FUZZY']
|
43 |
|
44 |
language = 'en'
|
45 |
|
|
|
69 |
pdf_doc_state = gr.State([])
|
70 |
all_image_annotations_state = gr.State([])
|
71 |
|
72 |
+
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
73 |
+
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
74 |
+
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
|
|
75 |
|
76 |
session_hash_state = gr.State()
|
77 |
s3_output_folder_state = gr.State()
|
|
|
130 |
## Settings page variables
|
131 |
default_allow_list_file_name = "default_allow_list.csv"
|
132 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
133 |
+
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
|
134 |
|
135 |
default_deny_list_file_name = "default_deny_list.csv"
|
136 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
137 |
+
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
138 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
139 |
|
140 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
141 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
142 |
+
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
|
143 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
144 |
|
145 |
# S3 settings for default allow list load
|
|
|
150 |
# Base dataframe for recognisers that is not modified subsequent to load
|
151 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
152 |
|
153 |
+
# Duplicate page detection
|
154 |
+
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
+
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
156 |
+
|
157 |
+
|
158 |
+
|
159 |
###
|
160 |
# UI DESIGN
|
161 |
###
|
|
|
171 |
|
172 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
173 |
|
174 |
+
###
|
175 |
+
# REDACTION PDF/IMAGES TABL
|
176 |
+
###
|
177 |
+
with gr.Tab("Redact PDFs/images"):
|
178 |
with gr.Accordion("Redact document", open = True):
|
179 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
180 |
if RUN_AWS_FUNCTIONS == "1":
|
|
|
203 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
204 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
205 |
|
206 |
+
###
|
207 |
+
# REVIEW REDACTIONS TAB
|
208 |
+
###
|
209 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
210 |
|
211 |
with gr.Accordion(label = "Review redaction file", open=True):
|
|
|
226 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
227 |
|
228 |
with gr.Row():
|
|
|
229 |
with gr.Column(scale=1):
|
230 |
|
231 |
zoom_str = str(annotator_zoom_number) + '%'
|
|
|
257 |
#with gr.Column(scale=1):
|
258 |
with gr.Row():
|
259 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
260 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
261 |
|
262 |
+
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
263 |
+
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
264 |
+
adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
|
265 |
+
convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
|
266 |
+
|
267 |
+
###
|
268 |
# TEXT / TABULAR DATA TAB
|
269 |
+
###
|
270 |
with gr.Tab(label="Open text or Excel/csv files"):
|
271 |
gr.Markdown(
|
272 |
"""
|
|
|
296 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
297 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
298 |
|
299 |
+
###
|
300 |
+
# IDENTIFY DUPLICATE PAGES TAB
|
301 |
+
###
|
302 |
+
with gr.Tab(label="Identify duplicate pages"):
|
303 |
+
with gr.Accordion("Identify duplicate pages to redact", open = True):
|
304 |
+
in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
305 |
+
|
306 |
+
find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
|
307 |
+
|
308 |
+
duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
309 |
+
|
310 |
+
###
|
311 |
# SETTINGS TAB
|
312 |
+
###
|
313 |
with gr.Tab(label="Redaction settings"):
|
314 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
315 |
with gr.Row():
|
|
|
325 |
|
326 |
with gr.Accordion("Select entity types to redact", open = True):
|
327 |
in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
|
|
|
328 |
in_redact_comprehend_entities = gr.Dropdown(value=chosen_comprehend_entities, choices=full_comprehend_entity_list, multiselect=True, label="AWS Comprehend PII identification model (click empty space in box for full list)")
|
329 |
|
330 |
+
with gr.Row():
|
331 |
+
max_fuzzy_spelling_mistakes_num = gr.Number(label="Maximum number of spelling mistakes allowed for fuzzy matching (CUSTOM_FUZZY entity).", value=1, minimum=0, maximum=9, precision=0)
|
332 |
+
match_fuzzy_whole_phrase_bool = gr.Checkbox(label="Should fuzzy match on entire phrases in deny list (as opposed to each word individually)?", value=True)
|
333 |
+
|
334 |
with gr.Accordion("Redact only selected pages", open = False):
|
335 |
with gr.Row():
|
336 |
page_min = gr.Number(precision=0,minimum=0,maximum=9999, label="Lowest page to redact")
|
|
|
344 |
with gr.Accordion("Settings for open text or xlsx/csv files", open = False):
|
345 |
anon_strat = gr.Radio(choices=["replace with <REDACTED>", "replace with <ENTITY_NAME>", "redact", "hash", "mask", "encrypt", "fake_first_name"], label="Select an anonymisation method.", value = "replace with <REDACTED>")
|
346 |
|
347 |
+
log_files_output = gr.File(label="Log file output", interactive=False)
|
348 |
+
|
349 |
+
with gr.Accordion("Combine multiple review files", open = False):
|
350 |
+
multiple_review_files_in_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv'])
|
351 |
+
merge_multiple_review_files_btn = gr.Button("Merge multiple review files into one", variant="primary")
|
352 |
+
|
353 |
+
|
354 |
+
|
355 |
+
|
356 |
+
### UI INTERACTION ###
|
357 |
|
358 |
###
|
359 |
# PDF/IMAGE REDACTION
|
360 |
###
|
361 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
362 |
|
363 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
364 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
365 |
+
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
|
366 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
367 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
368 |
|
369 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
370 |
+
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool],
|
371 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files]).\
|
372 |
then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
373 |
|
|
|
380 |
###
|
381 |
|
382 |
# Upload previous files for modifying redactions
|
383 |
+
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
384 |
+
then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
385 |
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
386 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
387 |
|
|
|
439 |
then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
440 |
then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
441 |
|
442 |
+
# Convert review file to xfdf Adobe format
|
443 |
+
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
444 |
+
then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
445 |
+
then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state], outputs=[adobe_review_files_out])
|
446 |
+
|
447 |
+
# Convert xfdf Adobe file back to review_file.csv
|
448 |
+
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
449 |
+
then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
|
450 |
+
then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state], outputs=[output_review_files], scroll_to_output=True)
|
451 |
+
|
452 |
###
|
453 |
# TABULAR DATA REDACTION
|
454 |
###
|
|
|
461 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
462 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
463 |
|
464 |
+
###
|
465 |
+
# IDENTIFY DUPLICATE PAGES
|
466 |
+
###
|
467 |
+
find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages], outputs=[duplicate_pages_df, duplicate_pages_out])
|
468 |
+
|
469 |
###
|
470 |
# SETTINGS PAGE INPUT / OUTPUT
|
471 |
###
|
472 |
+
# If a custom allow/deny/duplicate page list is uploaded
|
473 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
474 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
475 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
476 |
+
|
477 |
+
|
478 |
+
# Merge multiple review csv files together
|
479 |
+
merge_multiple_review_files_btn.click(fn=merge_csv_files, inputs=multiple_review_files_in_out, outputs=multiple_review_files_in_out)
|
480 |
|
481 |
|
482 |
###
|
requirements.txt
CHANGED
@@ -7,6 +7,8 @@ presidio_anonymizer==2.2.355
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
|
|
|
|
10 |
spacy==3.8.3
|
11 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
12 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
@@ -15,6 +17,8 @@ boto3==1.35.83
|
|
15 |
pyarrow==18.1.0
|
16 |
openpyxl==3.1.2
|
17 |
Faker==22.2.0
|
|
|
|
|
18 |
gradio_image_annotation==0.2.5
|
19 |
numpy==1.26.4
|
20 |
awslambdaric==3.0.0
|
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
10 |
+
nltk==3.9.1
|
11 |
+
scikit-learn==1.5.2
|
12 |
spacy==3.8.3
|
13 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
14 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
|
|
17 |
pyarrow==18.1.0
|
18 |
openpyxl==3.1.2
|
19 |
Faker==22.2.0
|
20 |
+
python-levenshtein==0.26.1
|
21 |
+
spaczz==0.6.1
|
22 |
gradio_image_annotation==0.2.5
|
23 |
numpy==1.26.4
|
24 |
awslambdaric==3.0.0
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -560,7 +560,7 @@ def run_page_text_redaction(
|
|
560 |
if not nlp_analyser:
|
561 |
raise ValueError("nlp_analyser is required for Local identification method")
|
562 |
|
563 |
-
print("page text:", page_text)
|
564 |
|
565 |
page_analyser_result = nlp_analyser.analyze(
|
566 |
text=page_text,
|
@@ -1077,15 +1077,15 @@ class CustomImageAnalyzerEngine:
|
|
1077 |
line_length = len(line_text)
|
1078 |
redaction_text = redaction_relevant_ocr_result.text
|
1079 |
|
1080 |
-
#
|
1081 |
|
1082 |
for redaction_result in text_analyzer_results:
|
1083 |
-
#
|
1084 |
-
#
|
1085 |
-
#
|
1086 |
-
#
|
1087 |
|
1088 |
-
# Check if the redaction text is
|
1089 |
|
1090 |
if redaction_text not in allow_list:
|
1091 |
|
@@ -1098,14 +1098,45 @@ class CustomImageAnalyzerEngine:
|
|
1098 |
matched_words = matched_text.split()
|
1099 |
|
1100 |
# print(f"Found match: '{matched_text}' in line")
|
|
|
|
|
|
|
|
|
|
|
|
|
1101 |
|
1102 |
# Find the corresponding words in the OCR results
|
1103 |
matching_word_boxes = []
|
|
|
|
|
|
|
|
|
|
|
1104 |
for word_info in ocr_results_with_children_child_info.get('words', []):
|
1105 |
-
|
1106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1107 |
matching_word_boxes.append(word_info['bounding_box'])
|
1108 |
-
#
|
1109 |
|
1110 |
if matching_word_boxes:
|
1111 |
# Calculate the combined bounding box for all matching words
|
@@ -1127,7 +1158,7 @@ class CustomImageAnalyzerEngine:
|
|
1127 |
text=matched_text
|
1128 |
)
|
1129 |
)
|
1130 |
-
#
|
1131 |
|
1132 |
return redaction_bboxes
|
1133 |
|
|
|
560 |
if not nlp_analyser:
|
561 |
raise ValueError("nlp_analyser is required for Local identification method")
|
562 |
|
563 |
+
#print("page text:", page_text)
|
564 |
|
565 |
page_analyser_result = nlp_analyser.analyze(
|
566 |
text=page_text,
|
|
|
1077 |
line_length = len(line_text)
|
1078 |
redaction_text = redaction_relevant_ocr_result.text
|
1079 |
|
1080 |
+
#print(f"Processing line: '{line_text}'")
|
1081 |
|
1082 |
for redaction_result in text_analyzer_results:
|
1083 |
+
#print(f"Checking redaction result: {redaction_result}")
|
1084 |
+
#print("redaction_text:", redaction_text)
|
1085 |
+
#print("line_length:", line_length)
|
1086 |
+
#print("line_text:", line_text)
|
1087 |
|
1088 |
+
# Check if the redaction text is not in the allow list
|
1089 |
|
1090 |
if redaction_text not in allow_list:
|
1091 |
|
|
|
1098 |
matched_words = matched_text.split()
|
1099 |
|
1100 |
# print(f"Found match: '{matched_text}' in line")
|
1101 |
+
|
1102 |
+
# for word_info in ocr_results_with_children_child_info.get('words', []):
|
1103 |
+
# # Check if this word is part of our match
|
1104 |
+
# if any(word.lower() in word_info['text'].lower() for word in matched_words):
|
1105 |
+
# matching_word_boxes.append(word_info['bounding_box'])
|
1106 |
+
# print(f"Matched word: {word_info['text']}")
|
1107 |
|
1108 |
# Find the corresponding words in the OCR results
|
1109 |
matching_word_boxes = []
|
1110 |
+
|
1111 |
+
#print("ocr_results_with_children_child_info:", ocr_results_with_children_child_info)
|
1112 |
+
|
1113 |
+
current_position = 0
|
1114 |
+
|
1115 |
for word_info in ocr_results_with_children_child_info.get('words', []):
|
1116 |
+
word_text = word_info['text']
|
1117 |
+
word_length = len(word_text)
|
1118 |
+
|
1119 |
+
# Assign start and end character positions
|
1120 |
+
#word_info['start_position'] = current_position
|
1121 |
+
#word_info['end_position'] = current_position + word_length
|
1122 |
+
|
1123 |
+
word_start = current_position
|
1124 |
+
word_end = current_position + word_length
|
1125 |
+
|
1126 |
+
# Update current position for the next word
|
1127 |
+
current_position += word_length + 1 # +1 for the space after the word
|
1128 |
+
|
1129 |
+
#print("word_info['bounding_box']:", word_info['bounding_box'])
|
1130 |
+
#print("word_start:", word_start)
|
1131 |
+
#print("start_in_line:", start_in_line)
|
1132 |
+
|
1133 |
+
#print("word_end:", word_end)
|
1134 |
+
#print("end_in_line:", end_in_line)
|
1135 |
+
|
1136 |
+
# Check if the word's bounding box is within the start and end bounds
|
1137 |
+
if word_start >= start_in_line and word_end <= (end_in_line + 1):
|
1138 |
matching_word_boxes.append(word_info['bounding_box'])
|
1139 |
+
#print(f"Matched word: {word_info['text']}")
|
1140 |
|
1141 |
if matching_word_boxes:
|
1142 |
# Calculate the combined bounding box for all matching words
|
|
|
1158 |
text=matched_text
|
1159 |
)
|
1160 |
)
|
1161 |
+
#print(f"Added bounding box for: '{matched_text}'")
|
1162 |
|
1163 |
return redaction_bboxes
|
1164 |
|
tools/data_anonymise.py
CHANGED
@@ -12,7 +12,7 @@ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerR
|
|
12 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
|
15 |
-
from tools.helper_functions import output_folder,
|
16 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
17 |
|
18 |
# Use custom version of analyze_dict to be able to track progress
|
@@ -434,7 +434,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
434 |
file_type = detect_file_type(anon_file)
|
435 |
print("File type is:", file_type)
|
436 |
|
437 |
-
out_file_part =
|
438 |
|
439 |
if file_type == 'xlsx':
|
440 |
print("Running through all xlsx sheets")
|
@@ -472,7 +472,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
472 |
else:
|
473 |
sheet_name = ""
|
474 |
anon_df = read_file(anon_file)
|
475 |
-
out_file_part =
|
476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
477 |
|
478 |
# Increase latest file completed count unless we are at the last file
|
|
|
12 |
from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
|
13 |
from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
|
14 |
|
15 |
+
from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
|
16 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
|
17 |
|
18 |
# Use custom version of analyze_dict to be able to track progress
|
|
|
434 |
file_type = detect_file_type(anon_file)
|
435 |
print("File type is:", file_type)
|
436 |
|
437 |
+
out_file_part = get_file_name_without_type(anon_file.name)
|
438 |
|
439 |
if file_type == 'xlsx':
|
440 |
print("Running through all xlsx sheets")
|
|
|
472 |
else:
|
473 |
sheet_name = ""
|
474 |
anon_df = read_file(anon_file)
|
475 |
+
out_file_part = get_file_name_without_type(anon_file.name)
|
476 |
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
477 |
|
478 |
# Increase latest file completed count unless we are at the last file
|
tools/file_conversion.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
-
from tools.helper_functions import
|
3 |
from PIL import Image, ImageFile
|
4 |
import os
|
5 |
import re
|
@@ -7,6 +7,7 @@ import time
|
|
7 |
import json
|
8 |
import pymupdf
|
9 |
import pandas as pd
|
|
|
10 |
from pymupdf import Rect
|
11 |
from fitz import Page
|
12 |
from tqdm import tqdm
|
@@ -240,7 +241,7 @@ def get_input_file_names(file_input:List[str]):
|
|
240 |
else:
|
241 |
file_path = file.name
|
242 |
|
243 |
-
file_path_without_ext =
|
244 |
|
245 |
file_extension = os.path.splitext(file_path)[1].lower()
|
246 |
|
@@ -489,7 +490,7 @@ def prepare_image_or_pdf(
|
|
489 |
file_path = file
|
490 |
else:
|
491 |
file_path = file.name
|
492 |
-
file_path_without_ext =
|
493 |
file_name_with_ext = os.path.basename(file_path)
|
494 |
|
495 |
if not file_path:
|
@@ -668,7 +669,7 @@ def prepare_image_or_pdf(
|
|
668 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
669 |
|
670 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
671 |
-
file_path_without_ext =
|
672 |
|
673 |
out_file_paths = out_text_file_path
|
674 |
|
@@ -754,7 +755,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
754 |
if 'text' not in box:
|
755 |
data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
|
756 |
else:
|
757 |
-
data_to_add = {"image": image_path, "page": reported_number, "text":
|
758 |
#print("data_to_add:", data_to_add)
|
759 |
flattened_annotation_data.append(data_to_add)
|
760 |
|
@@ -764,7 +765,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
764 |
#print("redaction_decision_output:", redaction_decision_output)
|
765 |
#print("annotation_data_as_df:", annotation_data_as_df)
|
766 |
|
767 |
-
# Join on additional text data from decision output results if included
|
768 |
if not redaction_decision_output.empty:
|
769 |
#print("redaction_decision_output is not empty")
|
770 |
#print("redaction_decision_output:", redaction_decision_output)
|
@@ -793,6 +794,9 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
|
|
793 |
if col not in annotation_data_as_df.columns:
|
794 |
annotation_data_as_df[col] = ''
|
795 |
|
|
|
|
|
|
|
796 |
annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
797 |
|
798 |
return annotation_data_as_df
|
|
|
1 |
from pdf2image import convert_from_path, pdfinfo_from_path
|
2 |
+
from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
|
3 |
from PIL import Image, ImageFile
|
4 |
import os
|
5 |
import re
|
|
|
7 |
import json
|
8 |
import pymupdf
|
9 |
import pandas as pd
|
10 |
+
import numpy as np
|
11 |
from pymupdf import Rect
|
12 |
from fitz import Page
|
13 |
from tqdm import tqdm
|
|
|
241 |
else:
|
242 |
file_path = file.name
|
243 |
|
244 |
+
file_path_without_ext = get_file_name_without_type(file_path)
|
245 |
|
246 |
file_extension = os.path.splitext(file_path)[1].lower()
|
247 |
|
|
|
490 |
file_path = file
|
491 |
else:
|
492 |
file_path = file.name
|
493 |
+
file_path_without_ext = get_file_name_without_type(file_path)
|
494 |
file_name_with_ext = os.path.basename(file_path)
|
495 |
|
496 |
if not file_path:
|
|
|
669 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
670 |
|
671 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
672 |
+
file_path_without_ext = get_file_name_without_type(in_file_path)
|
673 |
|
674 |
out_file_paths = out_text_file_path
|
675 |
|
|
|
755 |
if 'text' not in box:
|
756 |
data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
|
757 |
else:
|
758 |
+
data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
|
759 |
#print("data_to_add:", data_to_add)
|
760 |
flattened_annotation_data.append(data_to_add)
|
761 |
|
|
|
765 |
#print("redaction_decision_output:", redaction_decision_output)
|
766 |
#print("annotation_data_as_df:", annotation_data_as_df)
|
767 |
|
768 |
+
# Join on additional text data from decision output results if included, if text not already there
|
769 |
if not redaction_decision_output.empty:
|
770 |
#print("redaction_decision_output is not empty")
|
771 |
#print("redaction_decision_output:", redaction_decision_output)
|
|
|
794 |
if col not in annotation_data_as_df.columns:
|
795 |
annotation_data_as_df[col] = ''
|
796 |
|
797 |
+
for col in ['xmin', 'xmax', 'ymin', 'ymax']:
|
798 |
+
annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
|
799 |
+
|
800 |
annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
801 |
|
802 |
return annotation_data_as_df
|
tools/file_redaction.py
CHANGED
@@ -27,8 +27,8 @@ from presidio_analyzer import RecognizerResult
|
|
27 |
from tools.aws_functions import RUN_AWS_FUNCTIONS
|
28 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
29 |
from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
|
30 |
-
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser
|
31 |
-
from tools.helper_functions import
|
32 |
 from tools.aws_functions import RUN_AWS_FUNCTIONS
 from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
 from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
+from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
+from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
 from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
@@ -94,6 +94,8 @@ def choose_and_run_redactor(file_paths:List[str],
     page_break_return:bool=False,
     pii_identification_method:str="Local",
     comprehend_query_number:int=0,
+    max_fuzzy_spelling_mistakes_num:int=1,
+    match_fuzzy_whole_phrase_bool:bool=True,
     output_folder:str=output_folder,
     progress=gr.Progress(track_tqdm=True)):
     '''
@@ -127,6 +129,8 @@ def choose_and_run_redactor(file_paths:List[str],
     - page_break_return (bool, optional): A flag indicating if the function should return after a page break. Defaults to False.
     - pii_identification_method (str, optional): The method to redact personal information. Either 'Local' (spacy model), or 'AWS Comprehend' (AWS Comprehend API).
     - comprehend_query_number (int, optional): A counter tracking the number of queries to AWS Comprehend.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - output_folder (str, optional): Output folder for results.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -136,7 +140,7 @@ def choose_and_run_redactor(file_paths:List[str],
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []

-    print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
+    #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]

     if isinstance(custom_recogniser_word_list, pd.DataFrame):
@@ -279,9 +283,9 @@ def choose_and_run_redactor(file_paths:List[str],
         file_path = file.name

         if file_path:
-            pdf_file_name_without_ext =
+            pdf_file_name_without_ext = get_file_name_without_type(file_path)
             pdf_file_name_with_ext = os.path.basename(file_path)
-            print("Redacting file:", pdf_file_name_with_ext)
+            # print("Redacting file:", pdf_file_name_with_ext)

             is_a_pdf = is_pdf(file_path) == True
             if is_a_pdf == False and in_redact_method == text_ocr_option:
@@ -327,7 +331,9 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_client,
                 textract_client,
                 custom_recogniser_word_list,
-                redact_whole_page_list
+                redact_whole_page_list,
+                max_fuzzy_spelling_mistakes_num,
+                match_fuzzy_whole_phrase_bool)


             #print("log_files_output_paths at end of image redact function:", log_files_output_paths)
@@ -366,7 +372,9 @@ def choose_and_run_redactor(file_paths:List[str],
                 comprehend_query_number,
                 comprehend_client,
                 custom_recogniser_word_list,
-                redact_whole_page_list
+                redact_whole_page_list,
+                max_fuzzy_spelling_mistakes_num,
+                match_fuzzy_whole_phrase_bool)

             else:
                 out_message = "No redaction method selected"
@@ -414,13 +422,7 @@ def choose_and_run_redactor(file_paths:List[str],

             # Save the gradio_annotation_boxes to a JSON file
             try:
-
-
-                out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
-                with open(out_annotation_file_path, 'w') as f:
-                    json.dump(annotations_all_pages, f)
-                log_files_output_paths.append(out_annotation_file_path)
-
+
                 #print("Saving annotations to CSV")

                 # Convert json to csv and also save this
@@ -435,6 +437,13 @@ def choose_and_run_redactor(file_paths:List[str],

                 print("Saved review file to csv")

+                out_annotation_file_path = out_orig_pdf_file_path + '_review_file.json'
+                with open(out_annotation_file_path, 'w') as f:
+                    json.dump(annotations_all_pages, f)
+                log_files_output_paths.append(out_annotation_file_path)
+
+                print("Saving annotations to JSON")
+
             except Exception as e:
                 print("Could not save annotations to json or csv file:", e)

@@ -694,10 +703,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
                 x1 = pymupdf_x1
                 x2 = pymupdf_x2

-
-
-
-
+                if hasattr(annot, 'text') and annot.text:
+                    img_annotation_box["text"] = annot.text
+                else:
+                    img_annotation_box["text"] = ""

             # Else should be CustomImageRecognizerResult
             else:
@@ -715,10 +724,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
                     img_annotation_box["label"] = annot.entity_type
                 except:
                     img_annotation_box["label"] = "Redaction"
-
-
-
-
+
+                if hasattr(annot, 'text') and annot.text:
+                    img_annotation_box["text"] = annot.text
+                else:
+                    img_annotation_box["text"] = ""

             rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)  # Create the PyMuPDF Rect
@@ -749,12 +759,14 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo

             if isinstance(annot, Dictionary):
                 img_annotation_box["label"] = str(annot["/T"])
+
+                if hasattr(annot, 'Contents'):
+                    img_annotation_box["text"] = annot.Contents
+                else:
+                    img_annotation_box["text"] = ""
             else:
                 img_annotation_box["label"] = "REDACTION"
-
-            # img_annotation_box["text"] = annot.text
-            # else:
-            # img_annotation_box["text"] = ""
+                img_annotation_box["text"] = ""

             # Convert to a PyMuPDF Rect object
             #rect = Rect(rect_coordinates)
@@ -779,6 +791,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo

     return page, out_annotation_boxes

+###
+# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
+###
+
+
 def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):

     all_bboxes = []
@@ -908,6 +925,8 @@ def redact_image_pdf(file_path:str,
     textract_client:str="",
     custom_recogniser_word_list:List[str]=[],
     redact_whole_page_list:List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=1,
+    match_fuzzy_whole_phrase_bool:bool=True,
     page_break_val:int=int(page_break_value),
     log_files_output_paths:List=[],
     max_time:int=int(max_time_value),
@@ -940,14 +959,16 @@ def redact_image_pdf(file_path:str,
     - textract_client (optional): A connection to the AWS Textract service via the boto3 package.
     - custom_recogniser_word_list (optional): A list of custom words that the user has chosen specifically to redact.
     - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - page_break_val (int, optional): The value at which to trigger a page break. Defaults to 3.
     - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress (Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

-    The function returns a
+    The function returns a redacted PDF document along with processing output objects.
     '''
-    file_name =
+    file_name = get_file_name_without_type(file_path)
     fill = (0, 0, 0)   # Fill colour for redactions
     comprehend_query_number_new = 0
@@ -957,11 +978,14 @@ def redact_image_pdf(file_path:str,
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
         #print("new_custom_recogniser:", new_custom_recogniser)
-        nlp_analyser.registry.add_recognizer(new_custom_recogniser)
+        nlp_analyser.registry.add_recognizer(new_custom_recogniser)

+        nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
+        #print("new_custom_recogniser:", new_custom_recogniser)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)

-    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
-
+    image_analyser = CustomImageAnalyzerEngine(nlp_analyser)

     if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
         print("Connection to AWS Comprehend service unsuccessful.")
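
Reviewer note: both redaction paths now rebuild the "CUSTOM" and "CUSTOM_FUZZY" entries in the shared Presidio registry on every call. A minimal sketch of that registry pattern with stock Presidio classes, for anyone unfamiliar with it - here a deny-list `PatternRecognizer` stands in for the repo's `custom_word_list_recogniser`, and the example assumes `presidio-analyzer` plus a spaCy English model are installed:

```python
from presidio_analyzer import AnalyzerEngine, PatternRecognizer

analyzer = AnalyzerEngine()  # loads a spaCy English model under the hood

# Exact-match recogniser over a user-supplied word list, registered under the
# same "CUSTOM" entity name that the code above swaps in and out of the registry.
word_list_recogniser = PatternRecognizer(
    supported_entity="CUSTOM",
    deny_list=["Jane Doe", "Acme Ltd"],
)
analyzer.registry.add_recognizer(word_list_recogniser)

results = analyzer.analyze(
    text="Invoice raised by Jane Doe at Acme Ltd.",
    entities=["CUSTOM"],
    language="en",
)
for res in results:
    print(res.entity_type, res.start, res.end, res.score)
```

Removing and re-adding the recogniser per call (as the diff does) keeps a stale word list from a previous run out of the shared registry.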
@@ -1051,7 +1075,7 @@ def redact_image_pdf(file_path:str,
             #print("Image is in range of pages to redact")
             if isinstance(image, str):
-                print("image is a file path", image)
+                #print("image is a file path", image)
                 image = Image.open(image)

             # Need image size to convert textract OCR outputs to the correct sizes
@@ -1119,7 +1143,7 @@ def redact_image_pdf(file_path:str,
             line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)

             # Step 2: Analyze text and identify PII
-            if chosen_redact_entities:
+            if chosen_redact_entities or chosen_redact_comprehend_entities:

                 redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
                     line_level_ocr_results,
@@ -1185,6 +1209,7 @@ def redact_image_pdf(file_path:str,

             ## Apply annotations with pymupdf
             else:
+                print("merged_redaction_boxes:", merged_redaction_bboxes)
                 #print("redact_whole_page_list:", redact_whole_page_list)
                 if redact_whole_page_list:
                     int_reported_page_number = int(reported_page_number)
@@ -1309,7 +1334,7 @@ def redact_image_pdf(file_path:str,


 ###
-# PIKEPDF TEXT
+# PIKEPDF TEXT DETECTION/REDACTION
 ###

 def get_text_container_characters(text_container:LTTextContainer):
@@ -1466,6 +1491,8 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
 def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
     pikepdf_annotations_on_page = []
     for analysed_bounding_box in analysed_bounding_boxes:
+        #print("analysed_bounding_box:", analysed_bounding_boxes)
+
         bounding_box = analysed_bounding_box["boundingBox"]
         annotation = Dictionary(
             Type=Name.Annot,
@@ -1477,6 +1504,7 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
             IC=[0, 0, 0],
             CA=1,  # Transparency
             T=analysed_bounding_box["result"].entity_type,
+            Contents=analysed_bounding_box["text"],
             BS=Dictionary(
                 W=0,      # Border width: 1 point
                 S=Name.S  # Border style: solid
@@ -1485,182 +1513,6 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
         pikepdf_annotations_on_page.append(annotation)
     return pikepdf_annotations_on_page

-# def run_page_text_redaction(language: str, # Language of the PDF content
-#     chosen_redact_entities: List[str], # List of entities to be redacted
-#     chosen_redact_comprehend_entities: List[str],
-#     line_level_text_results_list: List[str],
-#     line_characters: List,
-#     page_analyser_results: List = [],
-#     page_analysed_bounding_boxes: List = [],
-#     comprehend_client = None, # Connection to AWS Comprehend
-#     allow_list: List[str] = None, # Optional list of allowed entities
-#     pii_identification_method: str = "Local"
-#     ):
-
-#     # Initialize batching variables
-#     current_batch = ""
-#     current_batch_mapping = []  # List of (start_pos, line_index, OCRResult) tuples
-#     all_text_line_results = []  # Store results for all lines
-#     text_container_analyser_results = []
-#     text_container_analysed_bounding_boxes = []
-
-#     # First pass: collect all lines into batches
-#     for i, text_line in enumerate(line_level_text_results_list):
-#         if chosen_redact_entities:
-#             if pii_identification_method == "Local":
-
-#                 #print("chosen_redact_entities:", chosen_redact_entities)
-
-#                 # Process immediately for local analysis
-#                 text_line_analyser_result = nlp_analyser.analyze(
-#                     text=text_line.text,
-#                     language=language,
-#                     entities=chosen_redact_entities,
-#                     score_threshold=score_threshold,
-#                     return_decision_process=True,
-#                     allow_list=allow_list
-#                 )
-#                 all_text_line_results.append((i, text_line_analyser_result))
-
-#             elif pii_identification_method == "AWS Comprehend":
-
-#                 # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
-#                 custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
-
-#                 text_line_analyser_result = nlp_analyser.analyze(
-#                     text=text_line.text,
-#                     language=language,
-#                     entities=custom_redact_entities,
-#                     score_threshold=score_threshold,
-#                     return_decision_process=True,
-#                     allow_list=allow_list
-#                 )
-#                 all_text_line_results.append((i, text_line_analyser_result))
-
-#                 if len(text_line.text) >= 3:
-#                     # Add separator between lines
-#                     if current_batch:
-#                         current_batch += " | "
-
-#                     start_pos = len(current_batch)
-#                     current_batch += text_line.text
-#                     current_batch_mapping.append((start_pos, i, text_line))
-
-#                 # Process batch if approaching 300 characters or last line
-#                 if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
-#                     print("length of text for Comprehend:", len(current_batch))
-
-#                     try:
-#                         response = comprehend_client.detect_pii_entities(
-#                             Text=current_batch,
-#                             LanguageCode=language
-#                         )
-#                     except Exception as e:
-#                         print(e)
-#                         time.sleep(3)
-#                         response = comprehend_client.detect_pii_entities(
-#                             Text=current_batch,
-#                             LanguageCode=language
-#                         )
-
-#                     comprehend_query_number += 1
-
-#                     # Process response and map back to original lines
-#                     if response and "Entities" in response:
-#                         for entity in response["Entities"]:
-#                             entity_start = entity["BeginOffset"]
-#                             entity_end = entity["EndOffset"]
-
-#                             # Find which line this entity belongs to
-#                             for batch_start, line_idx, original_line in current_batch_mapping:
-#                                 batch_end = batch_start + len(original_line.text)
-
-#                                 # Check if entity belongs to this line
-#                                 if batch_start <= entity_start < batch_end:
-#                                     # Adjust offsets relative to original line
-#                                     relative_start = entity_start - batch_start
-#                                     relative_end = min(entity_end - batch_start, len(original_line.text))
-
-#                                     result_text = original_line.text[relative_start:relative_end]
-
-#                                     if result_text not in allow_list:
-#                                         if entity.get("Type") in chosen_redact_comprehend_entities:
-#                                             # Create adjusted entity
-#                                             adjusted_entity = entity.copy()
-#                                             adjusted_entity["BeginOffset"] = relative_start
-#                                             adjusted_entity["EndOffset"] = relative_end
-
-#                                             recogniser_entity = recognizer_result_from_dict(adjusted_entity)
-
-#                                             # Add to results for this line
-#                                             existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
-#                                             if not existing_results:
-#                                                 all_text_line_results.append((line_idx, [recogniser_entity]))
-#                                             else:
-#                                                 existing_results.append(recogniser_entity)
-
-#                     # Reset batch
-#                     current_batch = ""
-#                     current_batch_mapping = []
-
-#     # Second pass: process results for each line
-#     for i, text_line in enumerate(line_level_text_results_list):
-#         text_line_analyser_result = []
-#         text_line_bounding_boxes = []
-
-#         # Get results for this line
-#         line_results = next((results for idx, results in all_text_line_results if idx == i), [])
-
-#         if line_results:
-#             text_line_analyser_result = line_results
-
-#             #print("Analysed text container, now merging bounding boxes")
-
-#             # Merge bounding boxes if very close together
-#             text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
-
-#             #print("merged bounding boxes")
-
-#             text_container_analyser_results.extend(text_line_analyser_result)
-#             #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
-
-#             #print("text_container_analyser_results:", text_container_analyser_results)
-
-#     page_analyser_results.extend(text_container_analyser_results) # Add this line
-#     page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
-
-#     return page_analysed_bounding_boxes
-
-# def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
-#     for entity in page_analyser_result:
-#         entity_start = entity.start
-#         entity_end = entity.end
-
-#         for batch_start, line_idx, original_line, chars in page_text_mapping:
-#             batch_end = batch_start + len(original_line.text)
-
-#             if batch_start <= entity_start < batch_end:
-#                 relative_start = entity_start - batch_start
-#                 relative_end = min(entity_end - batch_start, len(original_line.text))
-
-#                 adjusted_entity = copy.deepcopy(entity)
-#                 adjusted_entity.start = relative_start
-#                 adjusted_entity.end = relative_end
-
-#                 existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
-
-#                 if existing_entry is None:
-#                     all_text_line_results.append((line_idx, [adjusted_entity]))
-#                 else:
-#                     existing_entry.append(adjusted_entity)
-#                 break
-
-#     return all_text_line_results

 def redact_text_pdf(
     filename: str,                             # Path to the PDF file to be redacted
     prepared_pdf_image_path: str,              # Path to the prepared PDF image for redaction
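
Reviewer note: the commented-out batching implementation deleted above now lives as `run_page_text_redaction` in `tools/custom_image_analyser_engine.py` (see the new import at the top of this file). Its core trick - join OCR lines into one batch string, run detection once, then map entity offsets back to the owning line - is worth a standalone sketch. `detect` below is a hypothetical stand-in for the real PII call (`nlp_analyser.analyze` or Comprehend's `detect_pii_entities`):

```python
# Toy sketch of the batch-and-map-back idea: lines are joined with " | "
# separators, entities are detected against the batch string, and each span
# is translated back into per-line offsets.
def map_batch_entities_to_lines(lines, detect):
    batch = ""
    mapping = []  # (start_pos_in_batch, line_index)
    for i, line in enumerate(lines):
        if batch:
            batch += " | "
        mapping.append((len(batch), i))
        batch += line

    per_line = {i: [] for i in range(len(lines))}
    for start, end in detect(batch):  # entity spans over the batch string
        for batch_start, i in mapping:
            batch_end = batch_start + len(lines[i])
            if batch_start <= start < batch_end:
                per_line[i].append((start - batch_start,
                                    min(end - batch_start, len(lines[i]))))
                break
    return per_line

# Example with a fake detector that "finds" the word Jane wherever it occurs
lines = ["Report author Jane", "Reviewed by Jane too"]
fake_detect = lambda text: [(m, m + 4) for m in range(len(text)) if text[m:m + 4] == "Jane"]
print(map_batch_entities_to_lines(lines, fake_detect))  # {0: [(14, 18)], 1: [(12, 16)]}
```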
@@ -1682,6 +1534,8 @@ def redact_text_pdf(
     comprehend_client="",
     custom_recogniser_word_list:List[str]=[],
     redact_whole_page_list:List[str]=[],
+    max_fuzzy_spelling_mistakes_num:int=1,
+    match_fuzzy_whole_phrase_bool:bool=True,
     page_break_val: int = int(page_break_value),   # Value for page break
     max_time: int = int(max_time_value),
     progress: Progress = Progress(track_tqdm=True) # Progress tracking object
@@ -1711,6 +1565,8 @@ def redact_text_pdf(
     - comprehend_client (optional): A connection to the AWS Comprehend service via the boto3 package.
     - custom_recogniser_word_list (optional, List[str]): A list of custom words that the user has chosen specifically to redact.
     - redact_whole_page_list (optional, List[str]): A list of pages to fully redact.
+    - max_fuzzy_spelling_mistakes_num (int, optional): The maximum number of spelling mistakes allowed in a searched phrase for fuzzy matching. Can range from 0-9.
+    - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - page_break_val: Value for page break
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
@@ -1726,9 +1582,12 @@ def redact_text_pdf(
     if custom_recogniser_word_list:
         nlp_analyser.registry.remove_recognizer("CUSTOM")
         new_custom_recogniser = custom_word_list_recogniser(custom_recogniser_word_list)
-        #print("new_custom_recogniser:", new_custom_recogniser)
         nlp_analyser.registry.add_recognizer(new_custom_recogniser)

+        nlp_analyser.registry.remove_recognizer("CUSTOM_FUZZY")
+        new_custom_fuzzy_recogniser = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_recogniser_word_list, spelling_mistakes_max=max_fuzzy_spelling_mistakes_num, search_whole_phrase=match_fuzzy_whole_phrase_bool)
+        nlp_analyser.registry.add_recognizer(new_custom_fuzzy_recogniser)
+
     # List all elements currently in the nlp_analyser registry
     #print("Current recognizers in nlp_analyser registry:")
     #for recognizer_name in nlp_analyser.registry.recognizers:
@@ -1761,15 +1620,14 @@ def redact_text_pdf(
     for page_no in progress_bar:

         reported_page_number = str(page_no + 1)
-        print("Redacting page:", reported_page_number)
+        #print("Redacting page:", reported_page_number)

         # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
         try:
             image = prepared_pdf_image_path[page_no]#.copy()
             #print("image:", image)
         except Exception as e:
-            print("Could not redact page:", reported_page_number, "due to:")
-            print(e)
+            print("Could not redact page:", reported_page_number, "due to:", e)
             continue

         image_annotations = {"image": image, "boxes": []}
@@ -1825,27 +1683,32 @@ def redact_text_pdf(

             ### REDACTION

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if chosen_redact_entities or chosen_redact_comprehend_entities:
+                #print("Identifying redactions on page.")
+
+                page_analysed_bounding_boxes = run_page_text_redaction(
+                    language,
+                    chosen_redact_entities,
+                    chosen_redact_comprehend_entities,
+                    all_line_level_text_results_list,
+                    all_line_characters,
+                    page_analyser_results,
+                    page_analysed_bounding_boxes,
+                    comprehend_client,
+                    allow_list,
+                    pii_identification_method,
+                    nlp_analyser,
+                    score_threshold,
+                    custom_entities,
+                    comprehend_query_number
+                )
+
+                #print("page_analyser_results:", page_analyser_results)
+                #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
+                #print("image:", image)
+            else:
+                page_analysed_bounding_boxes = []
+

             page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)

@@ -1854,7 +1717,7 @@ def redact_text_pdf(
             # Annotate redactions on page
             pikepdf_annotations_on_page = create_pikepdf_annotations_for_bounding_boxes(page_analysed_bounding_boxes)

-            #print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)
+            # print("pikepdf_annotations_on_page:", pikepdf_annotations_on_page)

             # Make pymupdf page redactions
             #print("redact_whole_page_list:", redact_whole_page_list)
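
Reviewer note: with the `Contents` key added in `create_pikepdf_annotations_for_bounding_boxes`, each pikepdf annotation now carries the matched text that `redact_page_with_pymupdf` reads back via `annot.Contents`. A hedged sketch of building and attaching one such annotation with pikepdf (file names and the `Square` subtype are placeholders, not taken from the diff):

```python
import pikepdf
from pikepdf import Array, Dictionary, Name

with pikepdf.open("input.pdf") as pdf:  # placeholder file name
    page = pdf.pages[0]
    annotation = Dictionary(
        Type=Name.Annot,
        Subtype=Name.Square,        # box annotation (assumed subtype)
        Rect=[100, 700, 200, 720],  # PDF user-space coordinates
        T="PERSON",                 # entity label - read back as the box label
        Contents="Jane Doe",        # matched text - the key this PR adds
        CA=1,
        IC=[0, 0, 0],
        BS=Dictionary(W=0, S=Name.S),
    )
    # Append to the page's /Annots array, creating it if absent
    page_dict = page.obj
    if "/Annots" not in page_dict:
        page_dict["/Annots"] = pdf.make_indirect(Array([]))
    page_dict["/Annots"].append(annotation)
    pdf.save("annotated.pdf")
```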
tools/find_duplicate_pages.py
ADDED
@@ -0,0 +1,274 @@
import pandas as pd
import argparse
import glob
import os
import re
from tools.helper_functions import output_folder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
import random
import string
from typing import List

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

similarity_threshold = 0.9

stop_words = set(stopwords.words('english'))
# List of words to remove from the stopword set
#words_to_remove = ['no', 'nor', 'not', 'don', "don't", 'wasn', "wasn't", 'weren', "weren't"]

# Remove the specified words from the stopwords set
#for word in words_to_remove:
#    stop_words.discard(word.lower())

stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
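
Reviewer note: the three `nltk.download` calls above run at import time and pair with the new Dockerfile step (`python -m nltk.downloader punkt stopwords punkt_tab`). A guard like the following - a sketch, not part of the PR - would only download what is actually missing:

```python
# Optional guard: check the NLTK data this module needs is on the search path
# before first use, downloading only what is missing.
import nltk

for resource, path in [("punkt", "tokenizers/punkt"),
                       ("stopwords", "corpora/stopwords"),
                       ("punkt_tab", "tokenizers/punkt_tab")]:
    try:
        nltk.data.find(path)       # raises LookupError if the data is absent
    except LookupError:
        nltk.download(resource)
```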
def combine_ocr_output_text(input_files):
    """
    Combines text from multiple CSV files containing page and text columns.
    Groups text by file and page number, concatenating text within these groups.

    Args:
        input_files (list): List of paths to CSV files

    Returns:
        pd.DataFrame: Combined dataframe with columns [file, page, text]
    """
    all_data = []
    output_files = []

    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    for file in file_paths_list:

        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)

        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        # Group by page and concatenate text
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()

        # Add filename column
        grouped['file'] = os.path.basename(file_path)

        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)

    output_files.append(output_combined_file_path)

    return combined_df, output_files
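
A hypothetical usage sketch - the OCR output CSVs produced by a redaction run already have `page` and `text` columns, so they can be fed in directly (file names here are placeholders):

```python
# Combine per-line OCR output from two redaction runs into one page-level frame.
combined_df, written_files = combine_ocr_output_text(
    ["output/doc1_ocr_output.csv", "output/doc2_ocr_output.csv"]
)
print(combined_df.head())   # columns: file, page, text
print(written_files)        # [output_folder + "combined_ocr_output_files.csv"]
```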
def process_data(df, column:str):
    '''
    Clean and stem text columns in a data frame
    '''

    def _clean_text(raw_text):
        # Remove HTML tags
        clean = re.sub(r'<.*?>', '', raw_text)
        clean = re.sub(r'&nbsp;', ' ', clean)
        clean = re.sub(r'\r\n', ' ', clean)
        clean = re.sub(r'&lt;', ' ', clean)
        clean = re.sub(r'&gt;', ' ', clean)
        clean = re.sub(r'<strong>', ' ', clean)
        clean = re.sub(r'</strong>', ' ', clean)

        # Replace non-breaking space \xa0 with a space
        clean = clean.replace(u'\xa0', u' ')
        # Remove extra whitespace
        clean = ' '.join(clean.split())

        # Tokenize the text
        words = word_tokenize(clean.lower())

        # Remove punctuation and numbers
        words = [word for word in words if word.isalpha()]

        # Remove stopwords
        words = [word for word in words if word not in stop_words]

        # Join the cleaned words back into a string
        return ' '.join(words)

    # Function to apply stemming
    def _apply_stemming(text):
        # Tokenize the text
        words = word_tokenize(text.lower())

        # Apply stemming to each word
        stemmed_words = [stemmer.stem(word) for word in words]

        # Join the stemmed words back into a single string
        return ' '.join(stemmed_words)

    df['text_clean'] = df[column].apply(_clean_text)
    df['text_clean'] = df['text_clean'].apply(_apply_stemming)

    return df
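
A quick check of the cleaning + stemming pipeline on a single row:

```python
# Tags are stripped, stop words dropped, and remaining words Porter-stemmed,
# so inflected variants of the same page text converge before vectorisation.
import pandas as pd

sample = pd.DataFrame({"text": ["The <strong>payments</strong> were processed quickly!"]})
sample = process_data(sample, "text")
print(sample["text_clean"].iloc[0])  # e.g. "payment process quickli"
```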
def identify_similar_pages(input_files:List[str]):

    output_paths = []

    df, output_files = combine_ocr_output_text(input_files)

    output_paths.extend(output_files)

    # Clean text
    df = process_data(df, 'text')

    # Vectorise text
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    # Calculate cosine similarity
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Find the indices of the most similar pages
    np.fill_diagonal(similarity_matrix, 0)  # Ignore self-comparisons
    similar_pages = np.argwhere(similarity_matrix > similarity_threshold)  # Threshold of similarity

    #print(similar_pages)

    # Create a DataFrame for similar pairs and their scores
    similarity_df = pd.DataFrame({
        'Page1_Index': similar_pages[:, 0],
        'Page2_Index': similar_pages[:, 1],
        'Page1_File': similar_pages[:, 0],
        'Page2_File': similar_pages[:, 1],
        'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
    })

    # Filter out duplicate pairs (keep only one direction)
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    # Map the indices to their corresponding text and metadata
    similarity_df['Page1_File'] = similarity_df['Page1_File'].map(df['file'])
    similarity_df['Page2_File'] = similarity_df['Page2_File'].map(df['file'])

    similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
    similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])

    similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
    similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])

    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Save detailed results to a CSV file
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)

    output_paths.append(similarity_file_output_path)

    if not similarity_df_out.empty:
        unique_files = similarity_df_out['Page2_File'].unique()
        for redact_file in unique_files:
            output_file_name = output_folder + redact_file + "_whole_page.csv"
            whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File']==redact_file,:][['Page2_Page']]
            whole_pages_to_redact_df.to_csv(output_file_name, header=None, index=None)

            output_paths.append(output_file_name)

    return similarity_df_out, output_paths
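
The core of `identify_similar_pages` in isolation - vectorise page text with TF-IDF, take pairwise cosine similarity, zero the diagonal, and keep pairs above the threshold:

```python
# Minimal, self-contained demo of the duplicate-page detection step.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

pages = ["annual report summary figures",
         "summary figures annual report",      # same tokens, reordered
         "completely different content here"]
matrix = TfidfVectorizer().fit_transform(pages)
sim = cosine_similarity(matrix)
np.fill_diagonal(sim, 0)                       # ignore self-comparisons
print(np.argwhere(sim > 0.9))                  # [[0 1] [1 0]] - pages 0 and 1 match
```

TF-IDF is order-insensitive, which is why the reordered page scores 1.0 here; in the real pipeline the stemming step additionally lets inflected variants of the same wording match.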
# Perturb text
# Apply the perturbation function with a 10% error probability
def perturb_text_with_errors(series):

    def _perturb_text(text, error_probability=0.1):
        words = text.split()  # Split text into words
        perturbed_words = []

        for word in words:
            if random.random() < error_probability:  # Add a random error
                perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])

                if perturbation_type == 'char_error':  # Introduce a character error
                    idx = random.randint(0, len(word) - 1)
                    char = random.choice(string.ascii_lowercase)  # Add a random letter
                    word = word[:idx] + char + word[idx:]

                elif perturbation_type == 'extra_space':  # Add extra space around a word
                    word = ' ' + word + ' '

                elif perturbation_type == 'extra_punctuation':  # Add punctuation to the word
                    punctuation = random.choice(string.punctuation)
                    idx = random.randint(0, len(word))  # Insert punctuation randomly
                    word = word[:idx] + punctuation + word[idx:]

            perturbed_words.append(word)

        return ' '.join(perturbed_words)

    series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))

    return series

# Run through command line
# def main():
#     parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
#     parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
#     parser.add_argument('--output', '-o', default='combined_text.csv',
#                         help='Output CSV file path (default: combined_text.csv)')

#     args = parser.parse_args()

#     # Get list of input files
#     input_files = glob.glob(args.input_pattern)

#     if not input_files:
#         print(f"No files found matching pattern: {args.input_pattern}")
#         return

#     print(f"Processing {len(input_files)} files...")

#     try:
#         # Combine the text from all files
#         combined_df = combine_ocr_output_text(input_files)

#         # Save to CSV
#         combined_df.to_csv(args.output, index=False)
#         print(f"Successfully created combined output: {args.output}")
#         print(f"Total pages processed: {len(combined_df)}")

#     except Exception as e:
#         print(f"Error processing files: {str(e)}")

# if __name__ == "__main__":
#     main()
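
The perturbation helper presumably exists to test how robust the duplicate detector is to OCR-style noise. A hypothetical check:

```python
# Noise up page text and eyeball that the similarity search would still pair
# it with the clean original.
import pandas as pd

clean = pd.Series(["The payments were processed on the fourth of July"])
noisy = perturb_text_with_errors(clean.copy())
print(noisy.iloc[0])  # same sentence with ~10% of words randomly corrupted
```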
tools/helper_functions.py
CHANGED
@@ -4,23 +4,12 @@ import boto3
 from botocore.exceptions import ClientError
 import gradio as gr
 import pandas as pd
+import numpy as np
 import unicodedata
 from typing import List
 from gradio_image_annotation import image_annotator
 from tools.auth import user_pool_id

-def reset_state_vars():
-    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
-        label="Modify redaction boxes",
-        label_list=["Redaction"],
-        label_colors=[(0, 0, 0)],
-        show_label=False,
-        sources=None,#["upload"],
-        show_clear_button=False,
-        show_share_button=False,
-        show_remove_button=False,
-        interactive=False
-    ), [], []

 def get_or_create_env_var(var_name, default_value):
     # Get the environment variable if it exists
@@ -48,13 +37,40 @@ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
 input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
 print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')

+# Retrieving or setting CUSTOM_HEADER
+CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
+print(f'CUSTOM_HEADER found')
+
+# Retrieving or setting CUSTOM_HEADER_VALUE
+CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
+print(f'CUSTOM_HEADER_VALUE found')
+
+
+def reset_state_vars():
+    return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
+        label="Modify redaction boxes",
+        label_list=["Redaction"],
+        label_colors=[(0, 0, 0)],
+        show_label=False,
+        sources=None,#["upload"],
+        show_clear_button=False,
+        show_share_button=False,
+        show_remove_button=False,
+        interactive=False
+    ), [], [], [], pd.DataFrame(), pd.DataFrame()
+
+def reset_review_vars():
+    return [], pd.DataFrame(), pd.DataFrame()
+
+
 def load_in_default_allow_list(allow_list_file_path):
     if isinstance(allow_list_file_path, str):
         allow_list_file_path = [allow_list_file_path]
     return allow_list_file_path


-def
+def get_file_name_without_type(file_path):
     # First, get the basename of the file (e.g., "example.txt" from "/path/to/example.txt")
     basename = os.path.basename(file_path)

@@ -81,6 +97,8 @@ def detect_file_type(filename):
         return 'jpeg'
     elif filename.endswith('.png'):
         return 'png'
+    elif filename.endswith('.xfdf'):
+        return 'xfdf'
     else:
         raise ValueError("Unsupported file type.")

@@ -121,7 +139,7 @@ def custom_regex_load(in_file:List[str], file_type:str = "Allow list"):
     if regex_file_names:
         regex_file_name = regex_file_names[0]
         custom_regex = pd.read_csv(regex_file_name, low_memory=False, header=None)
-        #regex_file_name_no_ext =
+        #regex_file_name_no_ext = get_file_name_without_type(regex_file_name)

         custom_regex.columns = custom_regex.columns.astype(str)

@@ -215,13 +233,41 @@ def wipe_logs(feedback_logs_loc, usage_logs_loc):
     except Exception as e:
         print("Could not remove usage logs file", e)

-
-
-
+def merge_csv_files(file_list):
+
+    # Initialise an empty list to hold DataFrames
+    dataframes = []
+    output_files = []
+
+    # Loop through each file in the file list
+    for file in file_list:
+        # Read the CSV file into a DataFrame
+        df = pd.read_csv(file.name)
+        dataframes.append(df)
+
+    # Concatenate all DataFrames into a single DataFrame
+    merged_df = pd.concat(dataframes, ignore_index=True)
+
+    for col in ['xmin', 'xmax', 'ymin', 'ymax']:
+        merged_df[col] = np.floor(merged_df[col])
+
+    merged_df = merged_df.drop_duplicates(subset=['page', 'label', 'color', 'xmin', 'ymin', 'xmax', 'ymax'])
+
+    merged_df = merged_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+
+    file_out_name = os.path.basename(file_list[0])
+
+    merged_csv_path = output_folder + file_out_name + "_merged.csv"
+
+    # Save the merged DataFrame to a CSV file
+    #merged_csv = StringIO()
+    merged_df.to_csv(merged_csv_path, index=False)
+    output_files.append(merged_csv_path)
+    #merged_csv.seek(0) # Move to the beginning of the StringIO object
+
+    return output_files

-# Retrieving or setting CUSTOM_HEADER_VALUE
-CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
-print(f'CUSTOM_HEADER_VALUE found')

 async def get_connection_params(request: gr.Request):
     base_folder = ""
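
Reviewer note: the deduplication rule the new `merge_csv_files` applies is worth seeing in isolation - coordinates are floored so review boxes that differ only by sub-pixel amounts collapse to a single row:

```python
# Demo of the box-dedup step from merge_csv_files, on a small frame of
# near-identical review boxes.
import numpy as np
import pandas as pd

boxes = pd.DataFrame({
    "page":  [1, 1],
    "label": ["PERSON", "PERSON"],
    "color": ["(0, 0, 0)", "(0, 0, 0)"],
    "xmin": [100.2, 100.7], "ymin": [200.1, 200.3],
    "xmax": [180.9, 180.5], "ymax": [220.4, 220.2],
})
for col in ["xmin", "xmax", "ymin", "ymax"]:
    boxes[col] = np.floor(boxes[col])
boxes = boxes.drop_duplicates(subset=["page", "label", "color", "xmin", "ymin", "xmax", "ymax"])
print(len(boxes))  # 1 - the two near-identical boxes merge
```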
tools/load_spacy_model_custom_recognisers.py
CHANGED
@@ -3,9 +3,13 @@ from typing import List
 from presidio_analyzer import AnalyzerEngine, PatternRecognizer, EntityRecognizer, Pattern, RecognizerResult
 from presidio_analyzer.nlp_engine import SpacyNlpEngine, NlpArtifacts
 import spacy
+from spacy.matcher import Matcher, PhraseMatcher
+from spaczz.matcher import FuzzyMatcher
 spacy.prefer_gpu()
 from spacy.cli.download import download
+import Levenshtein
 import re
+import gradio as gr

 model_name = "en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001
@@ -65,16 +69,8 @@ ukpostcode_pattern = Pattern(
 # Define the recognizer with one or more patterns
 ukpostcode_recogniser = PatternRecognizer(supported_entity="UKPOSTCODE", name = "UKPOSTCODE", patterns = [ukpostcode_pattern])

-
-# Examples for testing
-
-#text = "I live in 510 Broad st SE5 9NG ."
-
-#numbers_result = ukpostcode_recogniser.analyze(text=text, entities=["UKPOSTCODE"])
-#print("Result:")
-#print(numbers_result)
+### Street name

-# %%
 def extract_street_name(text:str) -> str:
     """
     Extracts the street name and preceding word (that should contain at least one number) from the given text.
@@ -101,7 +97,7 @@ def extract_street_name(text:str) -> str:
     pattern += rf'(?P<street_name>\w+\s*\b(?:{street_types_pattern})\b)'

     # Find all matches in text
-    matches = re.finditer(pattern, text, re.IGNORECASE)
+    matches = re.finditer(pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)

     start_positions = []
     end_positions = []
@@ -120,19 +116,6 @@ def extract_street_name(text:str) -> str:

     return start_positions, end_positions

-
-# %%
-# Some examples for testing
-
-#text = "1234 Main Street, 5678 Oak Rd, 9ABC Elm Blvd, 42 Eagle st."
-#text = "Roberto lives in Five 10 Broad st in Oregon"
-#text = "Roberto lives in 55 Oregon Square"
-#text = "There is 51a no way I will do that"
-#text = "I am writing to apply for"
-
-#extract_street_name(text)
-
-# %%
 class StreetNameRecognizer(EntityRecognizer):

     def load(self) -> None:
@@ -163,14 +146,181 @@ class StreetNameRecognizer(EntityRecognizer):

 street_recogniser = StreetNameRecognizer(supported_entities=["STREETNAME"])

+## Custom fuzzy match recogniser for list of strings
+def custom_fuzzy_word_list_regex(text:str, custom_list:List[str]=[]):
+    # Create regex pattern, handling quotes carefully
+
+    quote_str = '"'
+    replace_str = '(?:"|“|”)'
+
+    custom_regex_pattern = '|'.join(
+        rf'(?<!\w){re.escape(term.strip()).replace(quote_str, replace_str)}(?!\w)'
+        for term in custom_list
+    )
+
+    # Find all matches in text
+    matches = re.finditer(custom_regex_pattern, text, re.DOTALL | re.MULTILINE | re.IGNORECASE)
+
+    start_positions = []
+    end_positions = []
+
+    for match in matches:
+        start_pos = match.start()
+        end_pos = match.end()
+
+        start_positions.append(start_pos)
+        end_positions.append(end_pos)
+
+    return start_positions, end_positions
+
+def spacy_fuzzy_search(text: str, custom_query_list:List[str]=[], spelling_mistakes_max:int = 1, search_whole_phrase:bool=True, nlp=nlp, progress=gr.Progress(track_tqdm=True)):
+    ''' Conduct fuzzy match on a list of text data.'''
+
+    all_matches = []
+    all_start_positions = []
+    all_end_positions = []
+    all_ratios = []
+
+    #print("custom_query_list:", custom_query_list)
+
+    if not text:
+        out_message = "Prepared data not found. Have you clicked 'Load data' above to prepare a search index?"
+        print(out_message)
+        return out_message, None
+
+    for string_query in custom_query_list:
+
+        #print("text:", text)
+        #print("string_query:", string_query)
+
+        query = nlp(string_query)
+
+        if search_whole_phrase == False:
+            # Keep only words that are not stop words
+            token_query = [token.text for token in query if not token.is_space and not token.is_stop and not token.is_punct]
+
+            spelling_mistakes_fuzzy_pattern = "FUZZY" + str(spelling_mistakes_max)
+
+            #print("token_query:", token_query)
+
+            if len(token_query) > 1:
+                #pattern_lemma = [{"LEMMA": {"IN": query}}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: {"IN": token_query}}}]
+            else:
+                #pattern_lemma = [{"LEMMA": query[0]}]
+                pattern_fuzz = [{"TEXT": {spelling_mistakes_fuzzy_pattern: token_query[0]}}]

+            matcher = Matcher(nlp.vocab)
+            matcher.add(string_query, [pattern_fuzz])
+            #matcher.add(string_query, [pattern_lemma])
+
+        else:
+            # If matching a whole phrase, use Spacy PhraseMatcher, then consider similarity after using Levenshtein distance.
+            #tokenised_query = [string_query.lower()]
+            # If you want to match the whole phrase, use phrase matcher
+            matcher = FuzzyMatcher(nlp.vocab)
 # Create a class inheriting from SpacyNlpEngine
 class LoadedSpacyNlpEngine(SpacyNlpEngine):
     def __init__(self, loaded_spacy_model):
         super().__init__()
         self.nlp = {"en": loaded_spacy_model}

-
-
 # Pass the loaded model to the new LoadedSpacyNlpEngine
 loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)

@@ -186,4 +336,5 @@ nlp_analyser.registry.add_recognizer(street_recogniser)
 nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
 nlp_analyser.registry.add_recognizer(titles_recogniser)
 nlp_analyser.registry.add_recognizer(custom_recogniser)
|
222 |
+
patterns = [nlp.make_doc(string_query)] # Convert query into a Doc object
|
223 |
+
matcher.add("PHRASE", patterns, [{"ignore_case": True}])
|
224 |
+
|
225 |
+
batch_size = 256
|
226 |
+
docs = nlp.pipe([text], batch_size=batch_size)
|
227 |
+
|
228 |
+
# Get number of matches per doc
|
229 |
+
for doc in docs: #progress.tqdm(docs, desc = "Searching text", unit = "rows"):
|
230 |
+
matches = matcher(doc)
|
231 |
+
match_count = len(matches)
|
232 |
+
|
233 |
+
# If considering each sub term individually, append match. If considering together, consider weight of the relevance to that of the whole phrase.
|
234 |
+
if search_whole_phrase==False:
|
235 |
+
all_matches.append(match_count)
|
236 |
+
|
237 |
+
for match_id, start, end in matches:
|
238 |
+
span = str(doc[start:end]).strip()
|
239 |
+
query_search = str(query).strip()
|
240 |
+
#print("doc:", doc)
|
241 |
+
#print("span:", span)
|
242 |
+
#print("query_search:", query_search)
|
243 |
+
|
244 |
+
# Convert word positions to character positions
|
245 |
+
start_char = doc[start].idx # Start character position
|
246 |
+
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
247 |
+
|
248 |
+
# The positions here are word position, not character position
|
249 |
+
all_matches.append(match_count)
|
250 |
+
all_start_positions.append(start_char)
|
251 |
+
all_end_positions.append(end_char)
|
252 |
+
|
253 |
+
else:
|
254 |
+
for match_id, start, end, ratio, pattern in matches:
|
255 |
+
span = str(doc[start:end]).strip()
|
256 |
+
query_search = str(query).strip()
|
257 |
+
print("doc:", doc)
|
258 |
+
print("span:", span)
|
259 |
+
print("query_search:", query_search)
|
260 |
+
|
261 |
+
# Calculate Levenshtein distance. Only keep matches with less than specified number of spelling mistakes
|
262 |
+
distance = Levenshtein.distance(query_search.lower(), span.lower())
|
263 |
+
|
264 |
+
print("Levenshtein distance:", distance)
|
265 |
+
|
266 |
+
if distance > spelling_mistakes_max:
|
267 |
+
match_count = match_count - 1
|
268 |
+
else:
|
269 |
+
# Convert word positions to character positions
|
270 |
+
start_char = doc[start].idx # Start character position
|
271 |
+
end_char = doc[end - 1].idx + len(doc[end - 1]) # End character position
|
272 |
+
|
273 |
+
print("start_char:", start_char)
|
274 |
+
print("end_char:", end_char)
|
275 |
+
|
276 |
+
all_matches.append(match_count)
|
277 |
+
all_start_positions.append(start_char)
|
278 |
+
all_end_positions.append(end_char)
|
279 |
+
all_ratios.append(ratio)
|
280 |
+
|
281 |
+
|
282 |
+
return all_start_positions, all_end_positions
|
283 |
+
|
284 |
+
|
285 |
+
class CustomWordFuzzyRecognizer(EntityRecognizer):
|
286 |
+
def __init__(self, supported_entities: List[str], custom_list: List[str] = [], spelling_mistakes_max: int = 1, search_whole_phrase: bool = True):
|
287 |
+
super().__init__(supported_entities=supported_entities)
|
288 |
+
self.custom_list = custom_list # Store the custom_list as an instance attribute
|
289 |
+
self.spelling_mistakes_max = spelling_mistakes_max # Store the max spelling mistakes
|
290 |
+
self.search_whole_phrase = search_whole_phrase # Store the search whole phrase flag
|
291 |
+
|
292 |
+
def load(self) -> None:
|
293 |
+
"""No loading is required."""
|
294 |
+
pass
|
295 |
+
|
296 |
+
def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
|
297 |
+
"""
|
298 |
+
Logic for detecting a specific PII
|
299 |
+
"""
|
300 |
+
start_pos, end_pos = spacy_fuzzy_search(text, self.custom_list, self.spelling_mistakes_max, self.search_whole_phrase) # Pass new parameters
|
301 |
+
|
302 |
+
results = []
|
303 |
+
|
304 |
+
for i in range(0, len(start_pos)):
|
305 |
+
result = RecognizerResult(
|
306 |
+
entity_type="CUSTOM_FUZZY",
|
307 |
+
start=start_pos[i],
|
308 |
+
end=end_pos[i],
|
309 |
+
score=1
|
310 |
+
)
|
311 |
+
results.append(result)
|
312 |
+
|
313 |
+
return results
|
314 |
+
|
315 |
+
custom_list_default = []
|
316 |
+
custom_word_fuzzy_recognizer = CustomWordFuzzyRecognizer(supported_entities=["CUSTOM_FUZZY"], custom_list=custom_list_default)
|
317 |
+
|
318 |
# Create a class inheriting from SpacyNlpEngine
|
319 |
class LoadedSpacyNlpEngine(SpacyNlpEngine):
|
320 |
def __init__(self, loaded_spacy_model):
|
321 |
super().__init__()
|
322 |
self.nlp = {"en": loaded_spacy_model}
|
323 |
|
|
|
|
|
324 |
# Pass the loaded model to the new LoadedSpacyNlpEngine
|
325 |
loaded_nlp_engine = LoadedSpacyNlpEngine(loaded_spacy_model = nlp)
|
326 |
|
|
|
336 |
nlp_analyser.registry.add_recognizer(ukpostcode_recogniser)
|
337 |
nlp_analyser.registry.add_recognizer(titles_recogniser)
|
338 |
nlp_analyser.registry.add_recognizer(custom_recogniser)
|
339 |
+
nlp_analyser.registry.add_recognizer(custom_word_fuzzy_recognizer)
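And how the new recogniser is reached once registered. A hypothetical call: assigning custom_list directly and the example strings are assumptions for the demo, since the app wires the list in elsewhere.

custom_word_fuzzy_recognizer.custom_list = ["Sean Pedrick-Case"]

results = nlp_analyser.analyze(
    text="Reviewed by Sean Pedrik-Case",  # one spelling mistake away
    entities=["CUSTOM_FUZZY"],
    language="en",
)
for result in results:
    print(result.entity_type, result.start, result.end, result.score)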
tools/redaction_review.py
CHANGED
@@ -1,12 +1,14 @@
 import gradio as gr
 import pandas as pd
 import numpy as np
+from xml.etree.ElementTree import Element, SubElement, tostring, parse
+from xml.dom import minidom
+import uuid
 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
-
-from tools.
-from tools.helper_functions import get_file_path_end, output_folder
+from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
+from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 import json
 import os
@@ -66,6 +68,12 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
     for image, items in image_groups.items():
         # Filter items with non-empty boxes
         non_empty_boxes = [item for item in items if item.get('boxes')]
+
+        # Remove 'text' elements from boxes
+        for item in non_empty_boxes:
+            if 'boxes' in item:
+                item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
+
         if non_empty_boxes:
            # Keep the first entry with non-empty boxes
            result.append(non_empty_boxes[0])
@@ -173,6 +181,8 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re

     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)

+
+
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
@@ -262,7 +272,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d

     for file_path in file_paths:
         #print("file_path:", file_path)
-        file_name_without_ext =
+        file_name_without_ext = get_file_name_without_type(file_path)
         file_name_with_ext = os.path.basename(file_path)

         file_extension = os.path.splitext(file_path)[1].lower()
@@ -381,3 +391,365 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
     row_value_page = evt.row_value[0] # This is the page number value
     return row_value_page

+def convert_image_coords_to_adobe(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from image space to Adobe PDF space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in image space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
+    '''
+
+    # Calculate scaling factors
+    scale_width = pdf_page_width / image_width
+    scale_height = pdf_page_height / image_height
+
+    # Convert coordinates
+    pdf_x1 = x1 * scale_width
+    pdf_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    pdf_y1 = pdf_page_height - (y1 * scale_height)
+    pdf_y2 = pdf_page_height - (y2 * scale_height)
+
+    # Make sure y1 is always less than y2 for Adobe's coordinate system
+    if pdf_y1 > pdf_y2:
+        pdf_y1, pdf_y2 = pdf_y2, pdf_y1
+
+    return pdf_x1, pdf_y1, pdf_x2, pdf_y2
+
+
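A quick sanity check of the conversion above, with invented numbers: a 595x842 point (A4-ish) page rendered at 2x into a 1190x1684 pixel image, and a box at image pixels (100, 200)-(300, 400).

x1, y1, x2, y2 = convert_image_coords_to_adobe(595, 842, 1190, 1684, 100, 200, 300, 400)
print(x1, y1, x2, y2)  # 50.0 642.0 150.0 742.0, with y flipped to a bottom-left origin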
+def create_xfdf(df, pdf_path, pymupdf_doc, image_paths):
+    '''
+    Create an xfdf file from a review csv file and a pdf
+    '''
+
+    # Create root element
+    xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
+
+    # Add header
+    header = SubElement(xfdf, 'header')
+    header.set('pdf-filepath', pdf_path)
+
+    # Add annots
+    annots = SubElement(xfdf, 'annots')
+
+    for _, row in df.iterrows():
+        page_python_format = int(row["page"])-1
+
+        pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+        pdf_page_height = pymupdf_page.rect.height
+        pdf_page_width = pymupdf_page.rect.width
+
+        image = image_paths[page_python_format]
+
+        #print("image:", image)
+
+        if isinstance(image, str):
+            image = Image.open(image)
+
+        image_page_width, image_page_height = image.size
+
+        # Create redaction annotation
+        redact_annot = SubElement(annots, 'redact')
+
+        # Generate unique ID
+        annot_id = str(uuid.uuid4())
+        redact_annot.set('name', annot_id)
+
+        # Set page number (subtract 1 as PDF pages are 0-based)
+        redact_annot.set('page', str(int(row['page']) - 1))
+
+        # Convert coordinates
+        x1, y1, x2, y2 = convert_image_coords_to_adobe(
+            pdf_page_width,
+            pdf_page_height,
+            image_page_width,
+            image_page_height,
+            row['xmin'],
+            row['ymin'],
+            row['xmax'],
+            row['ymax']
+        )
+
+        if CUSTOM_BOX_COLOUR == "grey":
+            colour_str = "0.5,0.5,0.5"
+        else:
+            colour_str = row['color'].strip('()').replace(' ', '')
+
+        # Set coordinates
+        redact_annot.set('rect', f"{x1:.2f},{y1:.2f},{x2:.2f},{y2:.2f}")
+
+        # Set redaction properties
+        redact_annot.set('title', row['label']) # The type of redaction (e.g., "PERSON")
+        redact_annot.set('contents', row['text']) # The redacted text
+        redact_annot.set('subject', row['label']) # The redaction label again
+        redact_annot.set('mimetype', "Form")
+
+        # Set appearance properties
+        redact_annot.set('border-color', colour_str) # Border colour
+        redact_annot.set('repeat', 'false')
+        redact_annot.set('interior-color', colour_str)
+        #redact_annot.set('fill-color', colour_str)
+        #redact_annot.set('outline-color', colour_str)
+        redact_annot.set('overlay-color', colour_str)
+        redact_annot.set('overlay-text', row['label'])
+        redact_annot.set('opacity', "0.5")
+
+    # Add appearance dictionary
+    # appearanceDict = SubElement(redact_annot, 'appearancedict')
+
+    # # Normal appearance
+    # normal = SubElement(appearanceDict, 'normal')
+    # #normal.set('appearance', 'redact')
+
+    # # Color settings for the mark (before applying redaction)
+    # markAppearance = SubElement(redact_annot, 'markappearance')
+    # markAppearance.set('stroke-color', colour_str) # Red outline
+    # markAppearance.set('fill-color', colour_str) # Light red fill
+    # markAppearance.set('opacity', '0.5') # 50% opacity
+
+    # # Final redaction appearance (after applying)
+    # redactAppearance = SubElement(redact_annot, 'redactAppearance')
+    # redactAppearance.set('fillColor', colour_str) # Black fill
+    # redactAppearance.set('fontName', 'Helvetica')
+    # redactAppearance.set('fontSize', '12')
+    # redactAppearance.set('textAlignment', 'left')
+    # redactAppearance.set('textColor', colour_str) # White text
+
+    # Convert to pretty XML string
+    xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent="  ")
+
+    return xml_str
+
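For reference, a minimal sketch of the review-file row shape create_xfdf consumes. The values are invented, and pymupdf_doc and image_paths are assumed to already be in scope as in the functions above.

import pandas as pd

review_df = pd.DataFrame([{
    "page": 1, "xmin": 100, "ymin": 200, "xmax": 300, "ymax": 400,
    "label": "PERSON", "text": "John Smith", "color": "(0, 0, 0)",
}])
xml_str = create_xfdf(review_df, "example.pdf", pymupdf_doc, image_paths)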
+def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths):
+    '''
+    Load in files to convert a review file into an Adobe comment file format
+    '''
+    output_paths = []
+    pdf_name = ""
+
+    if isinstance(input_files, str):
+        file_paths_list = [input_files]
+    else:
+        file_paths_list = input_files
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_name_without_type(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+
+        if file_path_end == "csv":
+            # If no pdf name, just get the name of the file path
+            if not pdf_name:
+                pdf_name = file_path_name
+            # Read CSV file
+            df = pd.read_csv(file_path)
+
+            df.fillna('', inplace=True)  # Replace NaN with an empty string
+
+            xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)
+
+            output_path = output_folder + file_path_name + "_adobe.xfdf"
+
+            with open(output_path, 'w', encoding='utf-8') as f:
+                f.write(xfdf_content)
+
+            output_paths.append(output_path)
+
+    return output_paths
+
+
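A hypothetical call, passing the original PDF together with its review CSV; the file names are invented and the keyword names simply mirror the signature above.

output_paths = convert_df_to_xfdf(
    ["output/example.pdf", "output/example_review_file.csv"],
    pdf_doc=pymupdf_doc,      # PyMuPDF document already opened by the app
    image_paths=image_paths,  # one rendered page image per PDF page
)
# e.g. ['output/example_review_file_adobe.xfdf'], depending on output_folder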
+### Convert xfdf coordinates back to image for app
+
+def convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_width, image_height, x1, y1, x2, y2):
+    '''
+    Converts coordinates from Adobe PDF space to image space.
+
+    Parameters:
+    - pdf_page_width: Width of the PDF page
+    - pdf_page_height: Height of the PDF page
+    - image_width: Width of the source image
+    - image_height: Height of the source image
+    - x1, y1, x2, y2: Coordinates in Adobe PDF space
+
+    Returns:
+    - Tuple of converted coordinates (x1, y1, x2, y2) in image space
+    '''
+
+    # Calculate scaling factors
+    scale_width = image_width / pdf_page_width
+    scale_height = image_height / pdf_page_height
+
+    # Convert coordinates
+    image_x1 = x1 * scale_width
+    image_x2 = x2 * scale_width
+
+    # Convert Y coordinates (flip vertical axis)
+    # Adobe coordinates start from bottom-left
+    image_y1 = (pdf_page_height - y1) * scale_height
+    image_y2 = (pdf_page_height - y2) * scale_height
+
+    # Make sure y1 is always less than y2 for image's coordinate system
+    if image_y1 > image_y2:
+        image_y1, image_y2 = image_y2, image_y1
+
+    return image_x1, image_y1, image_x2, image_y2
+
+def parse_xfdf(xfdf_path):
+    '''
+    Parse the XFDF file and extract redaction annotations.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+
+    Returns:
+    - List of dictionaries containing redaction information
+    '''
+    tree = parse(xfdf_path)
+    root = tree.getroot()
+
+    # Define the namespace
+    namespace = {'xfdf': 'http://ns.adobe.com/xfdf/'}
+
+    redactions = []
+
+    # Find all redact elements using the namespace
+    for redact in root.findall('.//xfdf:redact', namespaces=namespace):
+
+        #print("redact:", redact)
+
+        redaction_info = {
+            'image': '', # Image will be filled in later
+            'page': int(redact.get('page')) + 1, # Convert to 1-based index
+            'xmin': float(redact.get('rect').split(',')[0]),
+            'ymin': float(redact.get('rect').split(',')[1]),
+            'xmax': float(redact.get('rect').split(',')[2]),
+            'ymax': float(redact.get('rect').split(',')[3]),
+            'label': redact.get('title'),
+            'text': redact.get('contents'),
+            'color': redact.get('border-color', '(0, 0, 0)') # Default to black if not specified
+        }
+        redactions.append(redaction_info)
+
+    print("redactions:", redactions)
+
+    return redactions
+
+def convert_xfdf_to_dataframe(file_paths_list, pymupdf_doc, image_paths):
+    '''
+    Convert redaction annotations from XFDF and associated images into a DataFrame.
+
+    Parameters:
+    - xfdf_path: Path to the XFDF file
+    - pdf_doc: PyMuPDF document object
+    - image_paths: List of PIL Image objects corresponding to PDF pages
+
+    Returns:
+    - DataFrame containing redaction information
+    '''
+    output_paths = []
+    xfdf_paths = []
+    df = pd.DataFrame()
+
+    #print("Image paths:", image_paths)
+
+    # Sort the file paths so that the pdfs come first
+    file_paths_list = sorted(file_paths_list, key=lambda x: (os.path.splitext(x)[1] != '.pdf', os.path.splitext(x)[1] != '.json'))
+
+    for file in file_paths_list:
+
+        if isinstance(file, str):
+            file_path = file
+        else:
+            file_path = file.name
+
+        file_path_name = get_file_name_without_type(file_path)
+        file_path_end = detect_file_type(file_path)
+
+        if file_path_end == "pdf":
+            pdf_name = os.path.basename(file_path)
+            #print("pymupdf_doc:", pymupdf_doc)
+
+            # Add pdf to outputs
+            output_paths.append(file_path)
+
+        if file_path_end == "xfdf":
+
+            if not pdf_name:
+                message = "Original PDF needed to convert from .xfdf format"
+                print(message)
+                raise ValueError(message)
+
+            xfdf_path = file
+
+            # if isinstance(xfdf_paths, str):
+            #     xfdf_path = xfdf_paths.name
+            # else:
+            #     xfdf_path = xfdf_paths[0].name
+
+            file_path_name = get_file_name_without_type(xfdf_path)
+
+            #print("file_path_name:", file_path_name)
+
+            # Parse the XFDF file
+            redactions = parse_xfdf(xfdf_path)
+
+            # Create a DataFrame from the redaction information
+            df = pd.DataFrame(redactions)
+
+            df.fillna('', inplace=True) # Replace NaN with an empty string
+
+            for _, row in df.iterrows():
+                page_python_format = int(row["page"])-1
+
+                pymupdf_page = pymupdf_doc.load_page(page_python_format)
+
+                pdf_page_height = pymupdf_page.rect.height
+                pdf_page_width = pymupdf_page.rect.width
+
+                image_path = image_paths[page_python_format]
+
+                #print("image_path:", image_path)
+
+                if isinstance(image_path, str):
+                    image = Image.open(image_path)
+
+                image_page_width, image_page_height = image.size
+
+                # Convert to image coordinates
+                image_x1, image_y1, image_x2, image_y2 = convert_adobe_coords_to_image(pdf_page_width, pdf_page_height, image_page_width, image_page_height, row['xmin'], row['ymin'], row['xmax'], row['ymax'])
+
+                df.loc[_, ['xmin', 'ymin', 'xmax', 'ymax']] = [image_x1, image_y1, image_x2, image_y2]
+
+                # Optionally, you can add the image path or other relevant information
+                #print("Image path:", image_path)
+                df.loc[_, 'image'] = image_path
+
+            #print('row:', row)
+
+            out_file_path = output_folder + file_path_name + "_review_file.csv"
+            df.to_csv(out_file_path, index=None)
+
+            output_paths.append(out_file_path)
+
+    return output_paths
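Closing the loop, a sketch of the reverse direction (file names invented; PIL's Image is assumed to be imported at module level, as the function relies on it): after editing redactions in Acrobat, the exported .xfdf plus the original PDF regenerate a review CSV.

output_paths = convert_xfdf_to_dataframe(
    ["output/example.pdf", "output/example_adobe.xfdf"],
    pymupdf_doc,
    image_paths,
)
# e.g. ['output/example.pdf', 'output/example_adobe_review_file.csv']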