Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Mar 21

Commit

66e145d

1 Parent(s): 08a3ec3

Added features to the review dataframe to filter and exclude items based on text. Text should now appear consistently in review_df (for boxes that have not been modified). The larger spaCy model is back in use. Gradio upgraded.

Browse files

Files changed (9) hide show

DocRedactApp_0.2.0.spec +0 -66
app.py +111 -74
requirements.txt +4 -4
tools/aws_textract.py +86 -3
tools/file_conversion.py +254 -95
tools/file_redaction.py +132 -76
tools/helper_functions.py +3 -3
tools/load_spacy_model_custom_recognisers.py +3 -3
tools/redaction_review.py +286 -81

DocRedactApp_0.2.0.spec DELETED Viewed

@@ -1,66 +0,0 @@
-# -*- mode: python ; coding: utf-8 -*-
-from PyInstaller.utils.hooks import collect_data_files
-from PyInstaller.utils.hooks import collect_all
-datas = [('tesseract/', 'tesseract/'), ('poppler/poppler-24.02.0/', 'poppler/poppler-24.02.0/')]
-binaries = []
-hiddenimports = ['gradio_image_annotation', 'pyarrow.vendored.version', 'pydicom.encoders', 'safehttpx', 'presidio_analyzer', 'presidio_anonymizer', 'presidio_image_redactor']
-datas += collect_data_files('gradio_client')
-datas += collect_data_files('gradio')
-datas += collect_data_files('gradio_image_annotation')
-tmp_ret = collect_all('gradio_image_annotation')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-tmp_ret = collect_all('safehttpx')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-tmp_ret = collect_all('presidio_analyzer')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-tmp_ret = collect_all('presidio_anonymizer')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-tmp_ret = collect_all('presidio_image_redactor')
-datas += tmp_ret[0]; binaries += tmp_ret[1]; hiddenimports += tmp_ret[2]
-a = Analysis(
-    ['app.py'],
-    pathex=[],
-    binaries=binaries,
-    datas=datas,
-    hiddenimports=hiddenimports,
-    hookspath=['build_deps'],
-    hooksconfig={},
-    runtime_hooks=[],
-    excludes=[],
-    noarchive=False,
-    optimize=0,
-    module_collection_mode={
-        'gradio': 'py',  # Collect gradio package as source .py files
-    }
-)
-pyz = PYZ(a.pure)
-exe = EXE(
-    pyz,
-    a.scripts,
-    [],
-    exclude_binaries=True,
-    name='DocRedactApp_0.2.0',
-    debug=False,
-    bootloader_ignore_signals=False,
-    strip=False,
-    upx=True,
-    console=True,
-    disable_windowed_traceback=False,
-    argv_emulation=False,
-    target_arch=None,
-    codesign_identity=None,
-    entitlements_file=None,
-)
-coll = COLLECT(
-    exe,
-    a.binaries,
-    a.datas,
-    strip=False,
-    upx=True,
-    upx_exclude=[],
-    name='DocRedactApp_0.2.0',
-)

app.py CHANGED Viewed

@@ -14,7 +14,7 @@ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_pa
 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
-from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
@@ -81,15 +81,22 @@ with app:
     first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
     second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
     do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
     prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
-    document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
     images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
     output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
     output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
     text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
     log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
     # Logging state
@@ -115,6 +122,11 @@ with app:
     data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
     data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
     data_file_name_textbox_list = gr.Dropdown(label = "data_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
     estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
     annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
@@ -131,17 +143,14 @@ with app:
     ## Settings page variables
     default_allow_list_file_name = "default_allow_list.csv"
-    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
-    in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
     default_deny_list_file_name = "default_deny_list.csv"
-    default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
-    in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
     in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
-    fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
-    in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
     in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
     # S3 settings for default allow list load
@@ -150,14 +159,12 @@ with app:
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
     # Base dataframe for recognisers that is not modified subsequent to load
-    recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False, label="recogniser_entity_dataframe_base")
     # Duplicate page detection
     in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
     duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
     ###
     # UI DESIGN
     ###
@@ -178,7 +185,7 @@ with app:
     ###
     with gr.Tab("Redact PDFs/images"):
         with gr.Accordion("Redact document", open = True):
-            in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
             # if RUN_AWS_FUNCTIONS == "1":
             in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
             pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
@@ -220,14 +227,19 @@ with app:
             annotate_zoom_out = gr.Button("Zoom out", visible=False)
         with gr.Row():
             clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
-        with gr.Row():
-            annotation_last_page_button = gr.Button("Previous page", scale = 3)
-            annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
-            annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
-            annotation_next_page_button = gr.Button("Next page", scale = 3)
         with gr.Row():
-            with gr.Column(scale=3):
                 zoom_str = str(annotator_zoom_number) + '%'
@@ -249,17 +261,25 @@ with app:
                     interactive=False
                 )
             with gr.Column(scale=1):
-                #with gr.Row():
-                recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
-                recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=(2,"fixed"), type="pandas", label="Search results. Click to go to page")
-        with gr.Row():
-            annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
-            annotate_current_page_bottom = gr.Number(value=1, label="Page (press enter to change)", precision=0, interactive=True, scale = 2)
-            annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
-            annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
         with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
             convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
@@ -306,9 +326,7 @@ with app:
     with gr.Tab(label="Identify duplicate pages"):
         with gr.Accordion("Identify duplicate pages to redact", open = True):
             in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
-            find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
             duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
     ###
@@ -326,6 +344,11 @@ with app:
                 with gr.Column():
                     in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
                     in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
         with gr.Accordion("Select entity types to redact", open = True):
                 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
@@ -370,92 +393,106 @@ with app:
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
-    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
         success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
-                    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
-                    success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
-    # If the app has completed a batch of pages, it will run this until the end of all pages in the document
-    current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
-                    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
-                    success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     # If a file has been completed, the function will continue onto the next document
-    latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-                    success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     ###
     # REVIEW PDF REDACTIONS
     ###
     # Upload previous files for modifying redactions
-    upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
         success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-        success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes], api_name="prepare_doc").\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
     # Page controls at top
     annotate_current_page.submit(
-        modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     # Zoom in and out on annotator
-    annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
         success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
-    annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
         success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
-    annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
-    clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
-    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
     # Page controls at bottom
     annotate_current_page_bottom.submit(
-        modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
-        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     # Review table controls
-    recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
     recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
-    success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
-        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
-        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-        success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
         success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
     # Convert xfdf Adobe file back to review_file.csv
     convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
-        success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
         success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
     ###

 from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
 from tools.file_redaction import choose_and_run_redactor
 from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
+from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
 from tools.data_anonymise import anonymise_data_files
 from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
     first_loop_state = gr.Checkbox(label="first_loop_state", value=True, visible=False)
     second_loop_state = gr.Checkbox(label="second_loop_state", value=False, visible=False)
     do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
+    save_pdf_state = gr.Checkbox(label="save_pdf_state", value=True, visible=False)
     prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
+    document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
+    page_sizes = gr.Dropdown(label = "page_sizes", value="", allow_custom_value=True, visible=False)
     images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
     output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
     output_file_list_state = gr.Dropdown(label = "output_file_list", value="", allow_custom_value=True,visible=False)
     text_output_file_list_state = gr.Dropdown(label = "text_output_file_list", value="", allow_custom_value=True,visible=False)
     log_files_output_list_state = gr.Dropdown(label = "log_files_output_list", value="", allow_custom_value=True,visible=False)
+    # Backup versions of these objects in case you make a mistake
+    backup_review_state = gr.Dataframe(visible=False)
+    backup_image_annotations_state = gr.State([])
+    backup_recogniser_entity_dataframe_base = gr.Dataframe(visible=False)
     # Logging state
     data_file_name_no_extension_textbox = gr.Textbox(label = "data_full_file_name_textbox", value="", visible=False)
     data_file_name_with_extension_textbox = gr.Textbox(label = "data_file_name_with_extension_textbox", value="", visible=False)
     data_file_name_textbox_list = gr.Dropdown(label = "data_file_name_textbox_list", value="", allow_custom_value=True,visible=False)
+    # Constants just to use with the review dropdowns for filtering by various columns
+    label_name_const = gr.Textbox(label="label_name_const", value="label", visible=False)
+    text_name_const = gr.Textbox(label="text_name_const", value="text", visible=False)
+    page_name_const = gr.Textbox(label="page_name_const", value="page", visible=False)
     estimated_time_taken_number = gr.Number(label = "estimated_time_taken_number", value=0.0, precision=1, visible=False) # This keeps track of the time taken to redact files for logging purposes.
     annotate_previous_page = gr.Number(value=0, label="Previous page", precision=0, visible=False) # Keeps track of the last page that the annotator was on
     ## Settings page variables
     default_allow_list_file_name = "default_allow_list.csv"
+    default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
     default_deny_list_file_name = "default_deny_list.csv"
+    default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
     in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
     fully_redacted_list_file_name = "default_fully_redacted_list.csv"
+    fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
     in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
     # S3 settings for default allow list load
     default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
     # Base dataframe for recognisers that is not modified subsequent to load
+    recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"])
     # Duplicate page detection
     in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
     duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="duplicate_pages_df", visible=False, type="pandas")
     ###
     # UI DESIGN
     ###
     ###
     with gr.Tab("Redact PDFs/images"):
         with gr.Accordion("Redact document", open = True):
+            in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
             # if RUN_AWS_FUNCTIONS == "1":
             in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
             pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
             annotate_zoom_out = gr.Button("Zoom out", visible=False)
         with gr.Row():
             clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
+        with gr.Row():
+            with gr.Column(scale=2):
+                with gr.Row(equal_height=True):
+                    annotation_last_page_button = gr.Button("Previous page", scale = 4)
+                    annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
+                    annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
+                    annotation_next_page_button = gr.Button("Next page", scale = 4)
+            with gr.Column(scale=1):
+                blank_markdown_top = gr.Markdown(value="", label="")
         with gr.Row():
+            with gr.Column(scale=2):
                 zoom_str = str(annotator_zoom_number) + '%'
                     interactive=False
                 )
             with gr.Column(scale=1):
+                with gr.Row():
+                    recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
+                    page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
+                text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
+                recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
+                with gr.Row():
+                    reset_dropdowns_btn = gr.Button(value="Reset filters")
+                    exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
+                undo_last_removal_btn = gr.Button(value="Undo last element removal")
+        with gr.Row():
+            with gr.Column(scale=2):
+                with gr.Row(equal_height=True):
+                    annotation_last_page_button_bottom = gr.Button("Previous page", scale = 4)
+                    annotate_current_page_bottom = gr.Number(value=1, label="Current page", precision=0, interactive=True, scale = 2, min_width=50)
+                    annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
+                    annotation_next_page_button_bottom = gr.Button("Next page", scale = 4)
+            with gr.Column(scale=1):
+                blank_markdown_bot = gr.Markdown(value="", label="")
         with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
             convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
     with gr.Tab(label="Identify duplicate pages"):
         with gr.Accordion("Identify duplicate pages to redact", open = True):
             in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
+            find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
             duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
     ###
                 with gr.Column():
                     in_fully_redacted_list = gr.File(label="Import fully redacted pages list - csv table with one column of page numbers on each row. Page numbers in this file will be fully redacted.", file_count="multiple", height=file_input_height)
                     in_fully_redacted_list_text = gr.Textbox(label="Fully redacted page list load status")
+            with gr.Accordion("Manually modify custom allow, deny, and full page redaction lists", open = False):
+                with gr.Row():
+                    in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=True, type="pandas")
+                    in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=True, type="pandas")
+                    in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=True, type="pandas")
         with gr.Accordion("Select entity types to redact", open = True):
                 in_redact_entities = gr.Dropdown(value=chosen_redact_entities, choices=full_entity_list, multiselect=True, label="Local PII identification model (click empty space in box for full list)")
     ###
     in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
+    document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
         success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
+                    outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes], api_name="redact_doc").\
+                    success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
+    # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
+    # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
+    #                 outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
+    #                 success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # If a file has been completed, the function will continue onto the next document
+    # latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
+    #                 outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
+    #                 success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+    #                 success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
     ###
     # REVIEW PDF REDACTIONS
     ###
     # Upload previous files for modifying redactions
+    upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
         success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+        success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes], api_name="prepare_doc").\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # Page controls at top
     annotate_current_page.submit(
+        modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     # Zoom in and out on annotator
+    annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
         success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
+    annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
         success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
+    annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+    clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
+    annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
     # Page controls at bottom
     annotate_current_page_bottom.submit(
+        modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
+        success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
     # Review table controls
+    recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
+    page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
+    text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
     recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
+    success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
+        success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
+    reset_dropdowns_btn.click(reset_dropdowns, outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
+    exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])#.\
+        #success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
+    undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
+        success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
     # Convert review file to xfdf Adobe format
     convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+        success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
         success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
     # Convert xfdf Adobe file back to review_file.csv
     convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
+        success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
         success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
     ###

requirements.txt CHANGED Viewed

@@ -9,10 +9,10 @@ pikepdf==9.5.2
 pandas==2.2.3
 nltk==3.9.1
 scikit-learn==1.6.1
-spacy==3.8.3
-#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
-en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
-gradio==5.18.0
 boto3==1.36.26
 pyarrow==19.0.1
 openpyxl==3.1.5

 pandas==2.2.3
 nltk==3.9.1
 scikit-learn==1.6.1
+spacy==3.8.4
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+#en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
+gradio==5.22.0
 boto3==1.36.26
 pyarrow==19.0.1
 openpyxl==3.1.5

tools/aws_textract.py CHANGED Viewed

@@ -2,7 +2,9 @@ import boto3
 #from PIL import Image
 from typing import List
 import io
-#import json
 import pikepdf
 import time
 # Example: converting this single page to an image
@@ -26,7 +28,7 @@ def extract_textract_metadata(response):
         #'NumberOfPages': number_of_pages
     })
-def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
     '''
     Analyse page with AWS Textract
     '''
@@ -65,6 +67,11 @@ def analyse_page_with_textract(pdf_page_bytes, page_no, client="", handwrite_sig
             time.sleep(5)
             response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
     # Wrap the response with the page number in the desired format
     wrapped_response = {
         'page_no': page_no,
@@ -265,4 +272,80 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
             i += 1
-    return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children

 #from PIL import Image
 from typing import List
 import io
+import os
+import json
+from collections import defaultdict
 import pikepdf
 import time
 # Example: converting this single page to an image
         #'NumberOfPages': number_of_pages
     })
+def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str="", handwrite_signature_checkbox:List[str]=["Redact all identified handwriting", "Redact all identified signatures"]):
     '''
     Analyse page with AWS Textract
     '''
             time.sleep(5)
             response = client.detect_document_text(Document={'Bytes': pdf_page_bytes})
+     # Add the 'Page' attribute to each block
+    if "Blocks" in response:
+        for block in response["Blocks"]:
+            block["Page"] = page_no  # Inject the page number into each block
     # Wrap the response with the page number in the desired format
     wrapped_response = {
         'page_no': page_no,
             i += 1
+    return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
def load_and_convert_textract_json(textract_json_file_path, log_files_output_paths):
    """
    Load Textract JSON from a file, detect whether conversion is needed,
    and convert to the app's page-based format if necessary.

    Args:
        textract_json_file_path: Path to a previously saved Textract JSON file.
        log_files_output_paths: List of log file paths. The Textract file path is
            appended to this list (in place) when the file exists and is not
            already listed.

    Returns:
        A tuple of (textract_data, is_missing, log_files_output_paths).
        textract_data is a dict with a "pages" key on success, or an empty dict
        when the file is missing, unreadable as JSON, or in an unrecognised
        format. is_missing is True whenever no usable data could be returned.
    """
    if not os.path.exists(textract_json_file_path):
        print("No existing Textract results file found.")
        return {}, True, log_files_output_paths  # Empty dict and flag indicating missing file

    print("Found existing Textract json results file.")

    # Track log files
    if textract_json_file_path not in log_files_output_paths:
        log_files_output_paths.append(textract_json_file_path)

    try:
        with open(textract_json_file_path, 'r', encoding='utf-8') as json_file:
            textract_data = json.load(json_file)
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A file that is not valid UTF-8 is just as unparseable as malformed JSON
        print("Error: Failed to parse Textract JSON file. Returning empty data.")
        return {}, True, log_files_output_paths  # Indicate failure

    # Valid JSON can also be a list, string or number. The membership tests below
    # only make sense on an object, so reject anything else up front.
    if not isinstance(textract_data, dict):
        print("Invalid Textract JSON format: 'Blocks' missing.")
        return {}, True, log_files_output_paths

    # Check if conversion is needed
    if "pages" in textract_data:
        print("JSON already in the new format. No changes needed.")
        return textract_data, False, log_files_output_paths  # No conversion required

    if "Blocks" in textract_data:
        print("Need to convert Textract JSON to app format.")
        try:
            # restructure_textract_output is defined in this same module
            textract_data = restructure_textract_output(textract_data)
        except Exception as e:
            print("Failed to convert JSON data to app format due to:", e)
            return {}, True, log_files_output_paths  # Conversion failed
        return textract_data, False, log_files_output_paths  # Successfully converted

    print("Invalid Textract JSON format: 'Blocks' missing.")
    # Only report the top-level keys: the payload holds the document's OCR text,
    # which should not be written to the console of a redaction tool.
    print("textract data keys:", list(textract_data.keys()))
    return {}, True, log_files_output_paths  # Return empty data if JSON is not recognized
def restructure_textract_output(textract_output:object):
    '''
    Reorganise textract output that comes from the bulk textract analysis option on AWS to format that works in this app.

    Blocks are grouped by their "Page" value (blocks without one are treated as
    page 1). The result is {"pages": [...]}, sorted by page number, where each
    entry is {"page_no": "<page as string>", "data": {"Blocks": [...]}}. The
    input's "DocumentMetadata" (or an empty dict) is attached to the first
    page's data when at least one page exists.
    '''
    pages_dict = {}

    # `or []` also covers a payload where "Blocks" is present but null
    for block in textract_output.get("Blocks") or []:
        page_no = block.get("Page", 1)  # Default to 1 if not present

        # Create the page entry the first time this page number is seen
        page_entry = pages_dict.setdefault(page_no, {"page_no": str(page_no), "data": {"Blocks": []}})

        # Add block to corresponding page
        page_entry["data"]["Blocks"].append(block)

    # Convert dictionary to sorted list of pages
    structured_output = {
        "pages": [pages_dict[page] for page in sorted(pages_dict)]
    }

    # Add DocumentMetadata to the first page's data (optional)
    if structured_output["pages"]:
        structured_output["pages"][0]["data"]["DocumentMetadata"] = textract_output.get("DocumentMetadata", {})

    return structured_output

tools/file_conversion.py CHANGED Viewed

@@ -8,12 +8,16 @@ import json
 import pymupdf
 import pandas as pd
 import numpy as np
 from pymupdf import Rect
 from fitz import Page
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
 image_dpi = 300.0
 ImageFile.LOAD_TRUNCATED_IMAGES = True
@@ -53,9 +57,41 @@ def is_pdf(filename):
 CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
 print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
-import os
-from pdf2image import convert_from_path
-from PIL import Image
 def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
@@ -75,38 +111,16 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
             image = image.convert("L")
             image.save(out_path, format="PNG")
-        # Check file size and resize if necessary
-        max_size = 4.5 * 1024 * 1024  # 5 MB in bytes # 5
-        file_size = os.path.getsize(out_path)
-        # Resize images if they are too big
-        if file_size > max_size:
-            # Start with the original image size
-            width, height = image.size
-            print(f"Image size before {width}x{height}, original file_size: {file_size}")
-            while file_size > max_size:
-                # Reduce the size by a factor (e.g., 50% of the current size)
-                new_width = int(width * 0.5)
-                new_height = int(height * 0.5)
-                image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)
-                # Save the resized image
-                image.save(out_path, format="PNG", optimize=True)
-                # Update the file size
-                file_size = os.path.getsize(out_path)
-                print(f"Resized to {new_width}x{new_height}, new file_size: {file_size}")
-                # Update the dimensions for the next iteration
-                width, height = new_width, new_height
-        return page_num, out_path
     except Exception as e:
         print(f"Error processing page {page_num + 1}: {e}")
-        return page_num, None
 def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
@@ -125,44 +139,49 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
             futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
         for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
-            page_num, result = future.result()
             if result:
-                results.append((page_num, result))
             else:
                 print(f"Page {page_num + 1} failed to process.")
     # Sort results by page number
     results.sort(key=lambda x: x[0])
     images = [result[1] for result in results]
     print("PDF has been converted to images.")
-    return images
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path:str, prepare_for_review:bool=False):
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()
     # Check if the file is an image type
     if file_extension in ['.jpg', '.jpeg', '.png']:
         print(f"{file_path} is an image file.")
         # Perform image processing here
         img_object = [file_path] #[Image.open(file_path)]
-        # Load images from the file paths
     # Check if the file is a PDF
     elif file_extension == '.pdf':
         print(f"{file_path} is a PDF file. Converting to image set")
         # Run your function for processing PDF files here
-        img_object = convert_pdf_to_images(file_path, prepare_for_review)
     else:
         print(f"{file_path} is not an image or PDF file.")
-        img_object = ['']
-    return img_object
 def get_input_file_names(file_input:List[str]):
     '''
@@ -351,6 +370,7 @@ def prepare_image_or_pdf(
     all_annotations_object:List = [],
     prepare_for_review:bool = False,
     in_fully_redacted_list:List[int]=[],
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
@@ -369,7 +389,8 @@ def prepare_image_or_pdf(
         all_annotations_object(optional, List of annotation objects): All annotations for current document
         prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
         in_fully_redacted_list(optional, List of int): A list of pages to fully redact
-        progress (optional, Progress): Progress tracker for the operation.
     Returns:
@@ -381,7 +402,8 @@ def prepare_image_or_pdf(
     original_cropboxes = []  # Store original CropBox values
     if isinstance(in_fully_redacted_list, pd.DataFrame):
-        in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
@@ -433,7 +455,7 @@ def prepare_image_or_pdf(
             final_out_message = '\n'.join(out_message)
         else:
             final_out_message = out_message
-        return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
@@ -475,13 +497,22 @@ def prepare_image_or_pdf(
         if is_pdf(file_path):
             pymupdf_doc = pymupdf.open(file_path)
-            # Load cropbox dimensions to use later
-            for page in pymupdf_doc:
-                original_cropboxes.append(page.cropbox)  # Save original CropBox
             converted_file_path = file_path
-            image_file_paths = process_file(file_path, prepare_for_review)
             #Create base version of the annotation object that doesn't have any annotations in it
             if (not all_annotations_object) & (prepare_for_review == True):
@@ -503,14 +534,20 @@ def prepare_image_or_pdf(
             img = Image.open(file_path)  # Open the image file
             rect = pymupdf.Rect(0, 0, img.width, img.height)  # Create a rectangle for the image
-            page = pymupdf_doc.new_page(width=img.width, height=img.height)  # Add a new page
-            page.insert_image(rect, filename=file_path)  # Insert the image into the page
             file_path_str = str(file_path)
-            image_file_paths = process_file(file_path_str, prepare_for_review)
             #print("image_file_paths:", image_file_paths)
             converted_file_path = output_folder + file_name_with_ext
@@ -520,7 +557,7 @@ def prepare_image_or_pdf(
         elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
-            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths)
             json_from_csv = True
             print("Converted CSV review file to json")
@@ -537,13 +574,14 @@ def prepare_image_or_pdf(
                     all_annotations_object = json.loads(file_path)  # Use loads for string content
             # Assume it's a textract json
-            elif (file_extension in ['.json']) & (prepare_for_review != True):
-                # If the file loaded has end textract.json, assume this is a textract response object. Save this to the output folder so it can be found later during redaction and go to the next file.
-                json_contents = json.load(file_path)
-                # Write the response to a JSON file in output folder
-                out_folder = output_folder + file_path_without_ext + ".json"
-                with open(out_folder, 'w') as json_file:
-                    json.dump(json_contents, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
                 continue
             # If you have an annotations object from the above code
@@ -600,16 +638,16 @@ def prepare_image_or_pdf(
                     #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
                 # Get list of pages that are to be fully redacted and redact them
-                if in_fully_redacted_list:
-                    print("Redacting whole pages")
-                    for i, image in enumerate(image_file_paths):
-                        page = pymupdf_doc.load_page(i)
-                        rect_height = page.rect.height
-                        rect_width = page.rect.width
-                        whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
-                        all_annotations_object.append(whole_page_img_annotation_box)
                 # Write the response to a JSON file in output folder
                 out_folder = output_folder + file_path_without_ext + ".json"
@@ -645,7 +683,7 @@ def prepare_image_or_pdf(
     number_of_pages = len(image_file_paths)
-    return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
     file_path_without_ext = get_file_name_without_type(in_file_path)
@@ -655,7 +693,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
     # Convert annotated text pdf back to image to give genuine redactions
     print("Creating image version of redacted PDF to embed redactions.")
-    pdf_text_image_paths = process_file(out_text_file_path[0])
     out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
     pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
@@ -701,12 +739,13 @@ def join_values_within_threshold(df1, df2):
     print(final_df)
-def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
     '''
     Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
     '''
     # Flatten the data
     flattened_annotation_data = []
     if not isinstance(redaction_decision_output, pd.DataFrame):
         redaction_decision_output = pd.DataFrame()
@@ -739,54 +778,171 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
             flattened_annotation_data.append(data_to_add)
     # Convert to a DataFrame
-    annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
     #print("redaction_decision_output:", redaction_decision_output)
-    #print("annotation_data_as_df:", annotation_data_as_df)
     # Join on additional text data from decision output results if included, if text not already there
-    if not redaction_decision_output.empty:
-        #print("redaction_decision_output is not empty")
-        #print("redaction_decision_output:", redaction_decision_output)
-        #print("annotation_data_as_df:", annotation_data_as_df)
-        redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
-        annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
-        redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
-        # Round to the closest number divisible by 5
-        redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
-        redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
-        #annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
-        annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
-        annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
-        annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
-        annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
     # Ensure required columns exist, filling with blank if they don't
     for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
-        if col not in annotation_data_as_df.columns:
-            annotation_data_as_df[col] = ''
-    for col in ['xmin', 'xmax', 'ymin', 'ymax']:
-        annotation_data_as_df[col] = np.floor(annotation_data_as_df[col])
-    annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
-    return annotation_data_as_df
-def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
     '''
-    Convert a review csv to a json file for use by the Gradio Annotation object
     '''
     # Keep only necessary columns
     review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
     # Group the DataFrame by the 'image' column
     grouped_csv_pages = review_file_df.groupby('page')
@@ -795,6 +951,7 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
     for n, pdf_image_path in enumerate(image_paths):
         reported_page_number = int(n + 1)
         if reported_page_number in review_file_df["page"].values:
@@ -802,6 +959,8 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
             selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
             annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
             annotation = {
                 "image": pdf_image_path,
                 "boxes": annotation_boxes

 import pymupdf
 import pandas as pd
 import numpy as np
+import shutil
 from pymupdf import Rect
 from fitz import Page
 from tqdm import tqdm
 from gradio import Progress
 from typing import List, Optional
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from pdf2image import convert_from_path
+from PIL import Image
+from scipy.spatial import cKDTree
 image_dpi = 300.0
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
 print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
def check_image_size_and_reduce(out_path:str, image:"Image.Image", max_size:float=4.5 * 1024 * 1024):
    '''
    Check if the image file saved at out_path is above max_size bytes (default around 4.5mb), and reduce its size if necessary. 5mb is the maximum possible to submit to AWS Textract.

    While the file is too big, the image is halved in both dimensions and saved
    over out_path as an optimised PNG.

    Args:
        out_path: Path of the already-saved image file. It is overwritten when a resize is needed.
        image: The opened image that corresponds to out_path.
        max_size: Largest acceptable file size in bytes.

    Returns:
        (width, height) of the image as it now exists on disk.
    '''
    file_size = os.path.getsize(out_path)
    width = image.width
    height = image.height

    if file_size > max_size:
        print(f"Image size before {width}x{height}, original file_size: {file_size}")

    # Halve from the *current* dimensions each pass. Computing the new size from
    # the original dimensions every time would redo the same resize forever.
    # Stop at 1x1: the image cannot shrink further and a 0-pixel side is invalid.
    while file_size > max_size and (width > 1 or height > 1):
        width = max(1, int(width * 0.5))
        height = max(1, int(height * 0.5))

        image = image.resize((width, height), Image.Resampling.LANCZOS)

        # Save the resized image
        image.save(out_path, format="PNG", optimize=True)

        # Update the file size
        file_size = os.path.getsize(out_path)
        print(f"Resized to {width}x{height}, new file_size: {file_size}")

    return width, height
 def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
     try:
             image = image.convert("L")
             image.save(out_path, format="PNG")
+        width, height = image.size
+        # Check if image size too large and reduce if necessary
+        width, height = check_image_size_and_reduce(out_path, image)
+        return page_num, out_path, width, height
     except Exception as e:
         print(f"Error processing page {page_num + 1}: {e}")
+        return page_num, "", width, height
 def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min: int = 0, image_dpi: float = image_dpi, num_threads: int = 8, output_dir: str = '/input'):
             futures.append(executor.submit(process_single_page, pdf_path, page_num, image_dpi))
         for future in tqdm(as_completed(futures), total=len(futures), unit="pages", desc="Converting pages"):
+            page_num, result, width, height = future.result()
             if result:
+                results.append((page_num, result, width, height))
             else:
                 print(f"Page {page_num + 1} failed to process.")
     # Sort results by page number
     results.sort(key=lambda x: x[0])
     images = [result[1] for result in results]
+    widths = [result[2] for result in results]
+    heights = [result[3] for result in results]
     print("PDF has been converted to images.")
+    return images, widths, heights
 # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
 def process_file(file_path:str, prepare_for_review:bool=False):
     '''
     Take in a file path, decide if it is an image or pdf, then process appropriately.

     Returns three parallel lists: image file paths, image widths and image
     heights (one entry per page). All three are empty for unsupported file types.
     '''
     # Get the file extension
     file_extension = os.path.splitext(file_path)[1].lower()

     # Check if the file is an image type
     if file_extension in ['.jpg', '.jpeg', '.png']:
         print(f"{file_path} is an image file.")

         # Test whether the image is bigger than 4.5 mb and reduce it if needed (Textract limit is 5mb).
         # check_image_size_and_reduce returns a (width, height) pair, so wrap the values
         # in single-item lists to match the per-page lists returned for PDFs.
         image = Image.open(file_path)
         width, height = check_image_size_and_reduce(file_path, image)
         img_object = [file_path]
         image_sizes_width = [width]
         image_sizes_height = [height]

     # Check if the file is a PDF
     elif file_extension == '.pdf':
         print(f"{file_path} is a PDF file. Converting to image set")
         img_object, image_sizes_width, image_sizes_height = convert_pdf_to_images(file_path, prepare_for_review)

     else:
         print(f"{file_path} is not an image or PDF file.")
         img_object = []
         image_sizes_width = []
         image_sizes_height = []

     return img_object, image_sizes_width, image_sizes_height
 def get_input_file_names(file_input:List[str]):
     '''
     all_annotations_object:List = [],
     prepare_for_review:bool = False,
     in_fully_redacted_list:List[int]=[],
+    output_folder:str=output_folder,
     progress: Progress = Progress(track_tqdm=True)
 ) -> tuple[List[str], List[str]]:
     """
         all_annotations_object(optional, List of annotation objects): All annotations for current document
         prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
         in_fully_redacted_list(optional, List of int): A list of pages to fully redact
+        output_folder (optional, str): The output folder for file save
+        progress (optional, Progress): Progress tracker for the operation
     Returns:
     original_cropboxes = []  # Store original CropBox values
     if isinstance(in_fully_redacted_list, pd.DataFrame):
+        if not in_fully_redacted_list.empty:
+            in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
             final_out_message = '\n'.join(out_message)
         else:
             final_out_message = out_message
+        return final_out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes
     #in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
         if is_pdf(file_path):
             pymupdf_doc = pymupdf.open(file_path)
+            # Load cropbox dimensions to use later
             converted_file_path = file_path
+            image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
+            page_sizes = []
+            for i, page in enumerate(pymupdf_doc):
+                page_no = i
+                reported_page_no = i + 1
+                pymupdf_page = pymupdf_doc.load_page(page_no)
+                original_cropboxes.append(pymupdf_page.cropbox)  # Save original CropBox
+                # Create a page_sizes_object
+                out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
+                page_sizes.append(out_page_image_sizes)
             #Create base version of the annotation object that doesn't have any annotations in it
             if (not all_annotations_object) & (prepare_for_review == True):
             img = Image.open(file_path)  # Open the image file
             rect = pymupdf.Rect(0, 0, img.width, img.height)  # Create a rectangle for the image
+            pymupdf_page = pymupdf_doc.new_page(width=img.width, height=img.height)  # Add a new page
+            pymupdf_page.insert_image(rect, filename=file_path)  # Insert the image into the page
+            pymupdf_page = pymupdf_doc.load_page(0)
+            original_cropboxes.append(pymupdf_page.cropbox)  # Save original CropBox
             file_path_str = str(file_path)
+            image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path_str, prepare_for_review)
             #print("image_file_paths:", image_file_paths)
+            # Create a page_sizes_object
+            out_page_image_sizes = {"page":1, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
+            page_sizes.append(out_page_image_sizes)
             converted_file_path = output_folder + file_name_with_ext
         elif file_extension in ['.csv']:
             review_file_csv = read_file(file)
+            all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths, page_sizes)
             json_from_csv = True
             print("Converted CSV review file to json")
                     all_annotations_object = json.loads(file_path)  # Use loads for string content
             # Assume it's a textract json
+            elif (file_extension == '.json') and (prepare_for_review is not True):
+                # If the file ends with textract.json, assume it's a Textract response object.
+                # Copy it to the output folder so it can be used later.
+                out_folder = os.path.join(output_folder, file_path_without_ext + ".json")
+                # Use shutil to copy the file directly
+                shutil.copy2(file_path, out_folder)  # Preserves metadata
                 continue
             # If you have an annotations object from the above code
                     #print("all_annotations_object at end of json/csv load part:", all_annotations_object)
                 # Get list of pages that are to be fully redacted and redact them
+                # if not in_fully_redacted_list.empty:
+                #     print("Redacting whole pages")
+                #     for i, image in enumerate(image_file_paths):
+                #         page = pymupdf_doc.load_page(i)
+                #         rect_height = page.rect.height
+                #         rect_width = page.rect.width
+                #         whole_page_img_annotation_box = redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours = False, border = 5)
+                #         all_annotations_object.append(whole_page_img_annotation_box)
                 # Write the response to a JSON file in output folder
                 out_folder = output_folder + file_path_without_ext + ".json"
     number_of_pages = len(image_file_paths)
+    return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes, page_sizes
 def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
     file_path_without_ext = get_file_name_without_type(in_file_path)
     # Convert annotated text pdf back to image to give genuine redactions
     print("Creating image version of redacted PDF to embed redactions.")
+    pdf_text_image_paths, image_sizes_width, image_sizes_height = process_file(out_text_file_path[0])
     out_text_image_file_path = output_folder + file_path_without_ext + "_text_redacted_as_img.pdf"
     pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=image_dpi, save_all=True, append_images=pdf_text_image_paths[1:])
     print(final_df)
+def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
     '''
     Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
     '''
     # Flatten the data
     flattened_annotation_data = []
+    page_sizes_df = pd.DataFrame()
     if not isinstance(redaction_decision_output, pd.DataFrame):
         redaction_decision_output = pd.DataFrame()
             flattened_annotation_data.append(data_to_add)
     # Convert to a DataFrame
+    review_file_df = pd.DataFrame(flattened_annotation_data)
+    if page_sizes:
+        page_sizes_df = pd.DataFrame(page_sizes)
+        page_sizes_df["page"] = page_sizes_df["page"].astype(int)
+    # Convert data to same coordinate system
+    # If the maximum of every coordinate column is at least one, these are absolute image coordinates - convert them back to relative coordinates
+    if "xmin" in review_file_df.columns:
+        if review_file_df["xmin"].max() >= 1 and review_file_df["xmax"].max() >= 1 and review_file_df["ymin"].max() >= 1 and review_file_df["ymax"].max() >= 1:
+            print("review file df has large coordinates")
+            review_file_df["page"] = review_file_df["page"].astype(int)
+            if "image_width" not in review_file_df.columns and not page_sizes_df.empty:
+                review_file_df = review_file_df.merge(page_sizes_df, on="page", how="left")
+            if "image_width" in review_file_df.columns:
+                print("Dividing coordinates in review file")
+                review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
+                review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
+                review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]
+                review_file_df["ymax"] = review_file_df["ymax"] / review_file_df["image_height"]
+                #print("review_file_df after coordinates divided:", review_file_df)
+    if not redaction_decision_output.empty:
+        # If the maximum of every coordinate column is at least one, these are absolute image coordinates - convert them back to relative coordinates
+        if redaction_decision_output["xmin"].max() >= 1 and redaction_decision_output["xmax"].max() >= 1 and redaction_decision_output["ymin"].max() >= 1 and redaction_decision_output["ymax"].max() >= 1:
+            redaction_decision_output["page"] = redaction_decision_output["page"].astype(int)
+            if "image_width" not in redaction_decision_output.columns and not page_sizes_df.empty:
+                redaction_decision_output = redaction_decision_output.merge(page_sizes_df, on="page", how="left")
+            if "image_width" in redaction_decision_output.columns:
+                redaction_decision_output["xmin"] = redaction_decision_output["xmin"] / redaction_decision_output["image_width"]
+                redaction_decision_output["xmax"] = redaction_decision_output["xmax"] / redaction_decision_output["image_width"]
+                redaction_decision_output["ymin"] = redaction_decision_output["ymin"] / redaction_decision_output["image_height"]
+                redaction_decision_output["ymax"] = redaction_decision_output["ymax"] / redaction_decision_output["image_height"]
+    #print("convert_review_json review_file_df before merges:", review_file_df[['xmin', 'ymin', 'xmax', 'ymax', 'label']])
+    #print("review_file_df[xmin]", review_file_df["xmin"])
     #print("redaction_decision_output:", redaction_decision_output)
+    #print("review_file_df:", review_file_df)
     # Join on additional text data from decision output results if included, if text not already there
+    if not redaction_decision_output.empty:
+        if not 'text' in redaction_decision_output.columns:
+            redaction_decision_output['text'] = ''
+        if not 'text' in review_file_df.columns:
+            review_file_df['text'] = ''
+        # Load DataFrames
+        df1 = review_file_df.copy()
+        df2 = redaction_decision_output.copy()
+        #print("review_file before tolerance merge:", review_file_df)
+        #print("redaction_decision_output before tolerance merge:", redaction_decision_output)
+        # Create a unique key based on coordinates and label for exact merge
+        merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
+        df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
+        df2['key'] = df2[merge_keys].astype(str).agg('_'.join, axis=1)
+        # Attempt exact merge first
+        #merged_df = df1.merge(df2[['key', 'text']], on='key', how='left')
+        # Exact merge on the key; df2's 'text' column is suffixed to 'text_duplicate' so it does not clash with df1's 'text'
+        merged_df = df1.merge(df2[['key', 'text']], on='key', how='left', suffixes=('', '_duplicate'))
+        # Keep the existing df1 text where it is non-null (an empty string counts as non-null); otherwise fall back to the matched df2 text
+        merged_df['text'] = merged_df['text'].combine_first(merged_df.pop('text_duplicate'))
+        #print("merged_df['text']:", merged_df['text'])
+        # Handle missing matches using a proximity-based approach
+        #if merged_df['text'].isnull().sum() > 0:
+        print("Attempting tolerance-based merge for text")
+        # Convert coordinates to numpy arrays for KDTree lookup
+        tree = cKDTree(df2[['xmin', 'ymin', 'xmax', 'ymax']].values)
+        query_coords = df1[['xmin', 'ymin', 'xmax', 'ymax']].values
+        # Find nearest neighbors within a reasonable tolerance (e.g., 1% of page)
+        tolerance = 0.01
+        distances, indices = tree.query(query_coords, distance_upper_bound=tolerance)
+        # Assign text values where matches are found
+        for i, (dist, idx) in enumerate(zip(distances, indices)):
+            if dist < tolerance and idx < len(df2):
+                merged_df.at[i, 'text'] = df2.iloc[idx]['text']
+        # Drop the temporary key column
+        merged_df.drop(columns=['key'], inplace=True)
+        review_file_df = merged_df
+        review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
     # Ensure required columns exist, filling with blank if they don't
     for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
+        if col not in review_file_df.columns:
+            review_file_df[col] = ''
+    #for col in ['xmin', 'xmax', 'ymin', 'ymax']:
+    #    review_file_df[col] = np.floor(review_file_df[col])
+    # If colours are saved as list, convert to tuple
+    review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
+    # print("page_sizes:", page_sizes)
+    # Convert page sizes to relative values
+    # if page_sizes:
+    #     print("Checking page sizes")
+    #     page_sizes_df = pd.DataFrame(page_sizes)
+    #     if "image_width" not in review_file_df.columns:
+    #         review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
+    #     # If all coordinates all greater than one, this is a absolute image coordinates - change back to relative coordinates
+    #     if review_file_df["xmin"].max() > 1 and review_file_df["xmax"].max() > 1 and review_file_df["ymin"].max() > 1 and review_file_df["ymax"].max() > 1:
+    #         print("Dividing coordinates by image width and height.")
+    #         review_file_df["xmin"] = review_file_df["xmin"] / review_file_df["image_width"]
+    #         review_file_df["xmax"] = review_file_df["xmax"] / review_file_df["image_width"]
+    #         review_file_df["ymin"] = review_file_df["ymin"] / review_file_df["image_height"]
+    #         review_file_df["ymax"] = review_file_df["ymax"] / review_file_df["image_height"]
+    review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
+    review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
+    return review_file_df
+def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
     '''
+    Convert a review dataframe into the list-of-dicts (json-style) format used by the Gradio Annotation object.
     '''
+    if page_sizes:
+        page_sizes_df = pd.DataFrame(page_sizes)
+        #print(page_sizes_df)
+        if "image_width" not in review_file_df.columns:
+            review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
+        #print("review_file_df in convert pandas df to review json function:", review_file_df[["xmin", "xmax", "ymin", "ymax"]])
+        # If the maximum of every coordinate column is less than or equal to one, these are relative page coordinates - scale them back to image coordinates
+        if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
+            review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
+            review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
+            review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
+            review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
     # Keep only necessary columns
     review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
+    # If colours are saved as list, convert to tuple
+    review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
     # Group the DataFrame by the 'page' column
     grouped_csv_pages = review_file_df.groupby('page')
     for n, pdf_image_path in enumerate(image_paths):
         reported_page_number = int(n + 1)
         if reported_page_number in review_file_df["page"].values:
             selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
             annotation_boxes = selected_csv_pages.drop(columns=['image', 'page']).to_dict(orient='records')
+            # If all bbox coordinates are below 1, then they are relative. Need to convert based on image size.
             annotation = {
                 "image": pdf_image_path,
                 "boxes": annotation_boxes

tools/file_redaction.py CHANGED Viewed

@@ -30,7 +30,7 @@ from tools.file_conversion import process_file, image_dpi, convert_review_json_t
 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
 from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
-from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
@@ -100,7 +100,7 @@ def choose_and_run_redactor(file_paths:List[str],
  aws_access_key_textbox:str='',
  aws_secret_key_textbox:str='',
  annotate_max_pages:int=1,
- review_file_state=[],
  output_folder:str=output_folder,
  document_cropboxes:List=[],
  progress=gr.Progress(track_tqdm=True)):
@@ -139,7 +139,8 @@ def choose_and_run_redactor(file_paths:List[str],
     - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
     - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
-    - annotate_max_pages (int, optional): Maximum page value for the annotation object
     - output_folder (str, optional): Output folder for results.
     - document_cropboxes (List, optional): List of document cropboxes for the PDF.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
@@ -150,10 +151,29 @@ def choose_and_run_redactor(file_paths:List[str],
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
     # If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
     if not pymupdf_doc:
         print("Prepared PDF file not found, loading from file")
-        out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages, document_cropboxes)
     #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]
@@ -219,7 +239,7 @@ def choose_and_run_redactor(file_paths:List[str],
         estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
-        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
     # If we have reached the last page, return message and outputs
     if current_loop_page >= number_of_pages:
@@ -235,7 +255,7 @@ def choose_and_run_redactor(file_paths:List[str],
             review_out_file_paths.extend(out_review_file_path)
-        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
     # Create allow list
     # If string, assume file path
@@ -306,17 +326,7 @@ def choose_and_run_redactor(file_paths:List[str],
     progress(0.5, desc="Redacting file")
-    if isinstance(file_paths, str):
-        file_paths_list = [os.path.abspath(file_paths)]
-        file_paths_loop = file_paths_list
-    elif isinstance(file_paths, dict):
-        file_paths = file_paths["name"]
-        file_paths_list = [os.path.abspath(file_paths)]
-        file_paths_loop = file_paths_list
-    else:
-        file_paths_list = file_paths
-        file_paths_loop = [file_paths_list[int(latest_file_completed)]]
     for file in file_paths_loop:
         if isinstance(file, str):
             file_path = file
@@ -336,7 +346,7 @@ def choose_and_run_redactor(file_paths:List[str],
             out_message = "No file selected"
             print(out_message)
             raise Exception(out_message)
         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
             #Analyse and redact image-based pdf or image
@@ -346,7 +356,7 @@ def choose_and_run_redactor(file_paths:List[str],
             print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
-            pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
              prepared_pdf_image_paths,
              language,
              chosen_redact_entities,
@@ -389,7 +399,7 @@ def choose_and_run_redactor(file_paths:List[str],
             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
-            pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number = redact_text_pdf(file_path,
             prepared_pdf_image_paths,language,
             chosen_redact_entities,
             chosen_redact_comprehend_entities,
@@ -416,6 +426,10 @@ def choose_and_run_redactor(file_paths:List[str],
             print(out_message)
             raise Exception(out_message)
         # If at last page, save to file
         if current_loop_page >= number_of_pages:
@@ -437,21 +451,61 @@ def choose_and_run_redactor(file_paths:List[str],
             out_file_paths.append(out_redacted_pdf_file_path)
-            out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
             #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
             #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
             #log_files_output_paths.append(logs_output_file_name)
             all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
             all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
             out_file_paths.append(all_text_output_file_name)
-            # Save the gradio_annotation_boxes to a review csv file
             try:
-                review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
-                out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
                 review_df.to_csv(out_review_file_path, index=None)
                 out_file_paths.append(out_review_file_path)
@@ -465,7 +519,7 @@ def choose_and_run_redactor(file_paths:List[str],
                 #print("Saving annotations to JSON")
             except Exception as e:
-                print("Could not save annotations to csv file:", e)
             # Make a combined message for the file
             if isinstance(out_message, list):
@@ -486,7 +540,6 @@ def choose_and_run_redactor(file_paths:List[str],
             time_taken = toc - tic
             estimated_time_taken_state = estimated_time_taken_state + time_taken
    # If textract requests made, write to logging file
     if all_request_metadata:
         all_request_metadata_str = '\n'.join(all_request_metadata).strip()
@@ -507,7 +560,7 @@ def choose_and_run_redactor(file_paths:List[str],
     out_file_paths = list(set(out_file_paths))
     review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
-    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''
@@ -714,7 +767,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
                 x2 = pymupdf_x2
                 if hasattr(annot, 'text') and annot.text:
-                    img_annotation_box["text"] = annot.text
                 else:
                     img_annotation_box["text"] = ""
@@ -731,12 +784,12 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
                 img_annotation_box["ymax"] = annot.top + annot.height
                 img_annotation_box["color"] = (0,0,0)
                 try:
-                    img_annotation_box["label"] = annot.entity_type
                 except:
                     img_annotation_box["label"] = "Redaction"
                 if hasattr(annot, 'text') and annot.text:
-                    img_annotation_box["text"] = annot.text
                 else:
                     img_annotation_box["text"] = ""
@@ -771,7 +824,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
                     img_annotation_box["label"] = str(annot["/T"])
                     if hasattr(annot, 'Contents'):
-                        img_annotation_box["text"] = annot.Contents
                     else:
                         img_annotation_box["text"] = ""
                 else:
@@ -797,7 +850,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
     }
     page.apply_redactions(images=0, graphics=0)
-    page.set_cropbox(original_cropbox)  # Set CropBox to original size
     page.clean_contents()
     return page, out_annotation_boxes
@@ -1006,9 +1059,9 @@ def redact_image_pdf(file_path:str,
     if analysis_type == textract_option and textract_client == "":
-        print("Connection to AWS Textract service unsuccessful.")
-        return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
     tic = time.perf_counter()
@@ -1016,7 +1069,7 @@ def redact_image_pdf(file_path:str,
         out_message = "PDF does not exist as images. Converting pages to image"
         print(out_message)
-        prepared_pdf_file_paths = process_file(file_path)
     number_of_pages = len(prepared_pdf_file_paths)
     print("Number of pages:", str(number_of_pages))
@@ -1033,21 +1086,10 @@ def redact_image_pdf(file_path:str,
     # If running Textract, check if file already exists. If it does, load in existing data
     if analysis_type == textract_option:
-        json_file_path = output_folder + file_name + "_textract.json"
-        if not os.path.exists(json_file_path):
-            print("No existing Textract results file found.")
-            textract_data = {}
-        else:
-            # Open the file and load the JSON data
-            no_textract_file = False
-            print("Found existing Textract json results file.")
-            if json_file_path not in log_files_output_paths:
-                log_files_output_paths.append(json_file_path)
-            with open(json_file_path, 'r') as json_file:
-                textract_data = json.load(json_file)
     ###
     if current_loop_page == 0: page_loop_start = 0
@@ -1056,6 +1098,7 @@ def redact_image_pdf(file_path:str,
     progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
     original_cropboxes = []
     for page_no in progress_bar:
@@ -1077,7 +1120,8 @@ def redact_image_pdf(file_path:str,
         image_annotations = {"image": image, "boxes": []}
         pymupdf_page = pymupdf_doc.load_page(page_no)
-        original_cropboxes.append(pymupdf_page.cropbox)  # Save original CropBox
         pymupdf_page.set_cropbox(pymupdf_page.mediabox)  # Set CropBox to MediaBox
         if page_no >= page_min and page_no < page_max:
@@ -1085,10 +1129,15 @@ def redact_image_pdf(file_path:str,
             #print("Image is in range of pages to redact")
             if isinstance(image, str):
                 image = Image.open(image)
             # Need image size to convert textract OCR outputs to the correct sizes
             page_width, page_height = image.size
             # Possibility to use different languages
             if language == 'en': ocr_lang = 'eng'
             else: ocr_lang = language
@@ -1110,8 +1159,8 @@ def redact_image_pdf(file_path:str,
                     try:
                         text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
-                        if json_file_path not in log_files_output_paths:
-                            log_files_output_paths.append(json_file_path)
                         textract_data = {"pages":[text_blocks]}
                     except Exception as e:
@@ -1170,10 +1219,6 @@ def redact_image_pdf(file_path:str,
             else:
                 redaction_bboxes = []
-            # if analysis_type == tesseract_ocr_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + ".txt"
-            # elif analysis_type == textract_option: interim_results_file_path = output_folder + "interim_analyser_bboxes_" + file_name + "_pages_" + str(page_min + 1) + "_" + str(page_max) + "_textract.txt"
             # # Save decision making process
             # bboxes_str = str(redaction_bboxes)
             # with open(interim_results_file_path, "w") as f:
@@ -1282,17 +1327,17 @@ def redact_image_pdf(file_path:str,
                 if analysis_type == textract_option:
                     # Write the updated existing textract data back to the JSON file
-                    with open(json_file_path, 'w') as json_file:
                         json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
-                        if json_file_path not in log_files_output_paths:
-                            log_files_output_paths.append(json_file_path)
-                            print("At end of redact_image_pdf function where time over max.", json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
                 current_loop_page += 1
-                return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
         if is_pdf(file_path) == False:
             images.append(image)
@@ -1317,23 +1362,23 @@ def redact_image_pdf(file_path:str,
             if analysis_type == textract_option:
                 # Write the updated existing textract data back to the JSON file
-                with open(json_file_path, 'w') as json_file:
                     json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
-                    if json_file_path not in log_files_output_paths:
-                        log_files_output_paths.append(json_file_path)
-            return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
     if analysis_type == textract_option:
         # Write the updated existing textract data back to the JSON file
-        with open(json_file_path, 'w') as json_file:
             json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
-            if json_file_path not in log_files_output_paths:
-                log_files_output_paths.append(json_file_path)
-    return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
 ###
@@ -1565,11 +1610,13 @@ def redact_text_pdf(
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
     '''
     if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
         print("Connection to AWS Comprehend service not found.")
-        return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
     # Update custom word list analyser object with any new words that have been added to the custom deny list
     if custom_recogniser_word_list:
@@ -1600,6 +1647,7 @@ def redact_text_pdf(
     else: page_loop_start = current_loop_page
     original_cropboxes = []
     progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
@@ -1620,7 +1668,7 @@ def redact_text_pdf(
         image_annotations = {"image": image, "boxes": []}
         pymupdf_page = pymupdf_doc.load_page(page_no)
-        original_cropboxes.append(pymupdf_page.cropbox)  # Save original CropBox
         pymupdf_page.set_cropbox(pymupdf_page.mediabox)  # Set CropBox to MediaBox
         if page_min <= page_no < page_max:
@@ -1628,6 +1676,14 @@ def redact_text_pdf(
             if isinstance(image, str):
                 image_path = image
                 image = Image.open(image_path)
             for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
@@ -1749,7 +1805,7 @@ def redact_text_pdf(
                     current_loop_page += 1
-                    return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
         # Check if the image already exists in annotations_all_pages
@@ -1768,7 +1824,7 @@ def redact_text_pdf(
             page_break_return = True
             progress.close(_tqdm=progress_bar)
-            return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
-    return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number

 from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
 from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
+from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
 # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
  aws_access_key_textbox:str='',
  aws_secret_key_textbox:str='',
  annotate_max_pages:int=1,
+ review_file_state:pd.DataFrame=[],
  output_folder:str=output_folder,
  document_cropboxes:List=[],
  progress=gr.Progress(track_tqdm=True)):
     - match_fuzzy_whole_phrase_bool (bool, optional): A boolean where 'True' means that the whole phrase is fuzzy matched, and 'False' means that each word is fuzzy matched separately (excluding stop words).
     - aws_access_key_textbox (str, optional): AWS access key for account with Textract and Comprehend permissions.
     - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
+    - annotate_max_pages (int, optional): Maximum page value for the annotation object.
+    - review_file_state (pd.DataFrame, optional): Output review file dataframe.
     - output_folder (str, optional): Output folder for results.
     - document_cropboxes (List, optional): List of document cropboxes for the PDF.
     - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
     tic = time.perf_counter()
     all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
+    # Choose the correct file to prepare
+    if isinstance(file_paths, str):
+        file_paths_list = [os.path.abspath(file_paths)]
+    elif isinstance(file_paths, dict):
+        file_paths = file_paths["name"]
+        file_paths_list = [os.path.abspath(file_paths)]
+    else:
+        file_paths_list = file_paths
+    valid_extensions = {".pdf", ".jpg", ".jpeg", ".png"}
+    # Filter only files with valid extensions. Currently only allowing one file to be redacted at a time
+    file_paths_list = [list([file for file in file_paths_list if os.path.splitext(file)[1].lower() in valid_extensions])[0]]
+    # If latest_file_completed is used, get the specific file
+    if not isinstance(file_paths, (str, dict)):
+        file_paths_loop = [file_paths_list[int(latest_file_completed)]] if len(file_paths_list) > latest_file_completed else []
+    else:
+        file_paths_loop = file_paths_list
     # If there is no prepared PDF document, the prepare_image_or_pdf function has most likely not been run, so run it here to get the required outputs
     if not pymupdf_doc:
         print("Prepared PDF file not found, loading from file")
+        out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes, page_sizes = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages, document_cropboxes, redact_whole_page_list, output_folder)
     #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
     review_out_file_paths = [prepared_pdf_file_paths[0]]
         estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
         print("Estimated total processing time:", str(estimate_total_processing_time))
+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
     # If we have reached the last page, return message and outputs
     if current_loop_page >= number_of_pages:
             review_out_file_paths.extend(out_review_file_path)
+        return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
     # Create allow list
     # If string, assume file path
     progress(0.5, desc="Redacting file")
+    # Run through the file loop, redacting one file at a time
     for file in file_paths_loop:
         if isinstance(file, str):
             file_path = file
             out_message = "No file selected"
             print(out_message)
             raise Exception(out_message)
         if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
             #Analyse and redact image-based pdf or image
             print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
+            pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes = redact_image_pdf(file_path,
              prepared_pdf_image_paths,
              language,
              chosen_redact_entities,
             # Analyse text-based pdf
             print('Redacting file as text-based PDF')
+            pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes = redact_text_pdf(file_path,
             prepared_pdf_image_paths,language,
             chosen_redact_entities,
             chosen_redact_comprehend_entities,
             print(out_message)
             raise Exception(out_message)
+        # Output file paths
+        out_orig_pdf_file_path = output_folder + pdf_file_name_with_ext
+        out_review_file_path = out_orig_pdf_file_path + '_review_file.csv'
         # If at last page, save to file
         if current_loop_page >= number_of_pages:
             out_file_paths.append(out_redacted_pdf_file_path)
             #logs_output_file_name = out_orig_pdf_file_path + "_decision_process_output.csv"
             #all_decision_process_table.to_csv(logs_output_file_name, index = None, encoding="utf-8")
             #log_files_output_paths.append(logs_output_file_name)
+            # Convert OCR result bounding boxes to relative values
+            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
+            #print("page_sizes:", page_sizes)
+            #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
+            page_sizes_df = pd.DataFrame(page_sizes)
+            page_sizes_df["page"] = page_sizes_df["page"].astype(int)
+            all_line_level_ocr_results_df["page"] = all_line_level_ocr_results_df["page"].astype(int)
+            all_line_level_ocr_results_df = all_line_level_ocr_results_df.merge(page_sizes_df, on="page", how="left")
+            all_line_level_ocr_results_df["left"] = all_line_level_ocr_results_df["left"] / all_line_level_ocr_results_df["image_width"]
+            all_line_level_ocr_results_df["width"] = all_line_level_ocr_results_df["width"] / all_line_level_ocr_results_df["image_width"]
+            all_line_level_ocr_results_df["top"] = all_line_level_ocr_results_df["top"] / all_line_level_ocr_results_df["image_height"]
+            all_line_level_ocr_results_df["height"] = all_line_level_ocr_results_df["height"] / all_line_level_ocr_results_df["image_height"]
+            #print("all_line_level_ocr_results_df in choose and run redactor:", all_line_level_ocr_results_df)
             all_text_output_file_name = out_orig_pdf_file_path + "_ocr_output.csv"
             all_line_level_ocr_results_df.to_csv(all_text_output_file_name, index = None, encoding="utf-8")
             out_file_paths.append(all_text_output_file_name)
+            # Save the gradio_annotation_boxes to a review csv file
             try:
+                #print("annotations_all_pages before in choose and run redactor:", annotations_all_pages)
+                #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
+                #print("page_sizes before in choose and run redactor:", page_sizes)
+                review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table, page_sizes)
+                #print("annotation_all_pages:", annotations_all_pages)
+                #print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
+                #print("review_df after in choose and run redactor:", review_df)
+                review_df["page"] = review_df["page"].astype(int)
+                if "image_height" not in review_df.columns:
+                    review_df = review_df.merge(page_sizes_df, on="page", how="left")
+                # If the maximum of every coordinate column is at least one, the boxes are in absolute image coordinates - convert them back to relative coordinates
+                if review_df["xmin"].max() >= 1 and review_df["xmax"].max() >= 1 and review_df["ymin"].max() >= 1 and review_df["ymax"].max() >= 1:
+                    review_df["xmin"] = review_df["xmin"] / review_df["image_width"]
+                    review_df["xmax"] = review_df["xmax"] / review_df["image_width"]
+                    review_df["ymin"] = review_df["ymin"] / review_df["image_height"]
+                    review_df["ymax"] = review_df["ymax"] / review_df["image_height"]
+                # Don't need page sizes in outputs
+                review_df.drop(["image_width", "image_height", "mediabox_width", "mediabox_height", "cropbox_width", "cropbox_height"], axis=1, inplace=True, errors="ignore")
+                #print("review_df:", review_df)
                 review_df.to_csv(out_review_file_path, index=None)
                 out_file_paths.append(out_review_file_path)
                 #print("Saving annotations to JSON")
             except Exception as e:
+                print("Could not save annotations to csv file in choose and run redactor:", e)
             # Make a combined message for the file
             if isinstance(out_message, list):
             time_taken = toc - tic
             estimated_time_taken_state = estimated_time_taken_state + time_taken
    # If textract requests made, write to logging file
     if all_request_metadata:
         all_request_metadata_str = '\n'.join(all_request_metadata).strip()
     out_file_paths = list(set(out_file_paths))
     review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
+    return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes
 def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
     '''
                 x2 = pymupdf_x2
                 if hasattr(annot, 'text') and annot.text:
+                    img_annotation_box["text"] = str(annot.text)
                 else:
                     img_annotation_box["text"] = ""
                 img_annotation_box["ymax"] = annot.top + annot.height
                 img_annotation_box["color"] = (0,0,0)
                 try:
+                    img_annotation_box["label"] = str(annot.entity_type)
                 except:
                     img_annotation_box["label"] = "Redaction"
                 if hasattr(annot, 'text') and annot.text:
+                    img_annotation_box["text"] = str(annot.text)
                 else:
                     img_annotation_box["text"] = ""
                     img_annotation_box["label"] = str(annot["/T"])
                     if hasattr(annot, 'Contents'):
+                        img_annotation_box["text"] = str(annot.Contents)
                     else:
                         img_annotation_box["text"] = ""
                 else:
     }
     page.apply_redactions(images=0, graphics=0)
+    page.set_cropbox = original_cropbox  # Set CropBox to original size
     page.clean_contents()
     return page, out_annotation_boxes
     if analysis_type == textract_option and textract_client == "":
+        out_message = "Connection to AWS Textract service unsuccessful."
+        print(out_message)
+        raise Exception(out_message)
     tic = time.perf_counter()
         out_message = "PDF does not exist as images. Converting pages to image"
         print(out_message)
+        prepared_pdf_file_paths, image_sizes = process_file(file_path)
     number_of_pages = len(prepared_pdf_file_paths)
     print("Number of pages:", str(number_of_pages))
     # If running Textract, check if file already exists. If it does, load in existing data
     if analysis_type == textract_option:
+        textract_json_file_path = output_folder + file_name + "_textract.json"
+        # Load any existing Textract output for this file from the JSON path above
+        textract_data, is_missing, log_files_output_paths = load_and_convert_textract_json(textract_json_file_path, log_files_output_paths)
     ###
     if current_loop_page == 0: page_loop_start = 0
     progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
     original_cropboxes = []
+    page_sizes = []
     for page_no in progress_bar:
         image_annotations = {"image": image, "boxes": []}
         pymupdf_page = pymupdf_doc.load_page(page_no)
+        # Set visible page size to biggest size (mediabox) for redaction
+        original_cropboxes.append(pymupdf_page.cropbox.irect)  # Save original CropBox
         pymupdf_page.set_cropbox(pymupdf_page.mediabox)  # Set CropBox to MediaBox
         if page_no >= page_min and page_no < page_max:
             #print("Image is in range of pages to redact")
             if isinstance(image, str):
                 image = Image.open(image)
+            elif not isinstance(image, Image.Image):
+                raise TypeError(f"Unexpected image type: {type(image)}")  # Ensure image is valid
             # Need image size to convert textract OCR outputs to the correct sizes
             page_width, page_height = image.size
+            out_page_image_sizes = {"page":(page_no+1), "image_width":page_width, "image_height":page_height, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
+            page_sizes.append(out_page_image_sizes)
             # Possibility to use different languages
             if language == 'en': ocr_lang = 'eng'
             else: ocr_lang = language
                     try:
                         text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox)  # Analyse page with Textract
+                        if textract_json_file_path not in log_files_output_paths:
+                            log_files_output_paths.append(textract_json_file_path)
                         textract_data = {"pages":[text_blocks]}
                     except Exception as e:
             else:
                 redaction_bboxes = []
             # # Save decision making process
             # bboxes_str = str(redaction_bboxes)
             # with open(interim_results_file_path, "w") as f:
                 if analysis_type == textract_option:
                     # Write the updated existing textract data back to the JSON file
+                    with open(textract_json_file_path, 'w') as json_file:
                         json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
+                        if textract_json_file_path not in log_files_output_paths:
+                            log_files_output_paths.append(textract_json_file_path)
+                            print("At end of redact_image_pdf function where time over max.", textract_json_file_path, "not found in log_files_output_paths, appended to list:", log_files_output_paths)
                 current_loop_page += 1
+                return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes
         if is_pdf(file_path) == False:
             images.append(image)
             if analysis_type == textract_option:
                 # Write the updated existing textract data back to the JSON file
+                with open(textract_json_file_path, 'w') as json_file:
                     json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
+                    if textract_json_file_path not in log_files_output_paths:
+                        log_files_output_paths.append(textract_json_file_path)
+            return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes
     if analysis_type == textract_option:
         # Write the updated existing textract data back to the JSON file
+        with open(textract_json_file_path, 'w') as json_file:
             json.dump(textract_data, json_file, indent=4)  # indent=4 makes the JSON file pretty-printed
+            if textract_json_file_path not in log_files_output_paths:
+                log_files_output_paths.append(textract_json_file_path)
+    return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number, page_sizes
 ###
     - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
     - progress: Progress tracking object
     '''
+    page_sizes = []
+    out_page_image_sizes = {}
     if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
         print("Connection to AWS Comprehend service not found.")
+        return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes
     # Update custom word list analyser object with any new words that have been added to the custom deny list
     if custom_recogniser_word_list:
     else: page_loop_start = current_loop_page
     original_cropboxes = []
+    page_sizes = []
     progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
         image_annotations = {"image": image, "boxes": []}
         pymupdf_page = pymupdf_doc.load_page(page_no)
+        original_cropboxes.append(pymupdf_page.cropbox.irect)  # Save original CropBox
         pymupdf_page.set_cropbox(pymupdf_page.mediabox)  # Set CropBox to MediaBox
         if page_min <= page_no < page_max:
             if isinstance(image, str):
                 image_path = image
                 image = Image.open(image_path)
+            elif not isinstance(image, Image.Image):
+                raise TypeError(f"Unexpected image type: {type(image)}")  # Ensure image is valid
+            # Need image size to convert textract OCR outputs to the correct sizes
+            page_width, page_height = image.size
+            out_page_image_sizes = {"page":(page_no+1), "image_width":page_width, "image_height":page_height, "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
+            page_sizes.append(out_page_image_sizes)
             for page_layout in extract_pages(filename, page_numbers = [page_no], maxpages=1):
                     current_loop_page += 1
+                    return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes
         # Check if the image already exists in annotations_all_pages
             page_break_return = True
             progress.close(_tqdm=progress_bar)
+            return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes
+    return pymupdf_doc, all_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, page_sizes

tools/helper_functions.py CHANGED Viewed

@@ -34,7 +34,7 @@ aws_pii_detector  = "AWS Comprehend"
 output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
 print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
-session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'True')
 print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
 input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
@@ -60,10 +60,10 @@ def reset_state_vars():
             show_share_button=False,
             show_remove_button=False,
             interactive=False
-        ), [], [], [], pd.DataFrame(), pd.DataFrame(), []
 def reset_review_vars():
-    return [], pd.DataFrame(), pd.DataFrame()
 def load_in_default_allow_list(allow_list_file_path):
     if isinstance(allow_list_file_path, str):

 output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
 print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
+session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
 print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
 input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
             show_share_button=False,
             show_remove_button=False,
             interactive=False
+        ), [], [], pd.DataFrame(), pd.DataFrame(), []
 def reset_review_vars():
+    return pd.DataFrame(), pd.DataFrame()
 def load_in_default_allow_list(allow_list_file_path):
     if isinstance(allow_list_file_path, str):

tools/load_spacy_model_custom_recognisers.py CHANGED Viewed

@@ -11,14 +11,14 @@ import Levenshtein
 import re
 import gradio as gr
-model_name = "en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001
 custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
 #Load spacy model
 try:
-	import en_core_web_sm
-	nlp = en_core_web_sm.load()
 	print("Successfully imported spaCy model")
 except:

 import re
 import gradio as gr
+model_name = "en_core_web_lg" #"en_core_web_sm" #"en_core_web_trf"
 score_threshold = 0.001
 custom_entities = ["TITLES", "UKPOSTCODE", "STREETNAME", "CUSTOM"]
 #Load spacy model
 try:
+	import en_core_web_lg #en_core_web_sm
+	nlp = en_core_web_lg.load() #en_core_web_sm.load()
 	print("Successfully imported spaCy model")
 except:

tools/redaction_review.py CHANGED Viewed

@@ -7,7 +7,7 @@ import uuid
 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
-from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, CUSTOM_BOX_COLOUR
 from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 import json
@@ -84,56 +84,146 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
     return result
-def get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr):
     recogniser_entities_list = ["Redaction"]
-    recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
-    recogniser_dataframe_out = recogniser_dataframe_gr
     try:
-        review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
-        recogniser_entities = review_dataframe["label"].unique().tolist()
-        recogniser_entities.append("ALL")
-        recogniser_entities_for_drop = sorted(recogniser_entities)
-        recogniser_dataframe_out = gr.Dataframe(review_dataframe)
-        recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_for_drop[0], choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
-        recogniser_entities_list = [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL']  # Remove any existing 'Redaction'
-        recogniser_entities_list.insert(0, 'Redaction')  # Add 'Redaction' to the start of the list
     except Exception as e:
         print("Could not extract recogniser information:", e)
-        recogniser_dataframe_out = recogniser_dataframe_gr
-        recogniser_entities_drop = gr.Dropdown(value="", choices=[""], allow_custom_value=True, interactive=True)
         recogniser_entities_list = ["Redaction"]
-    return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list
-def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True), recogniser_dataframe_gr=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), zoom:int=100):
     '''
-    Update a gradio_image_annotation object with new annotation data
-    '''
     recogniser_entities_list = ["Redaction"]
     recogniser_dataframe_out = pd.DataFrame()
-    if recogniser_dataframe_gr.empty:
-        recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
-    elif recogniser_dataframe_gr.iloc[0,0] == "":
-        recogniser_dataframe_gr, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list = get_recogniser_dataframe_out(image_annotator_object, recogniser_dataframe_gr)
     else:
-        review_dataframe = update_entities_df(recogniser_entities_drop, recogniser_dataframe_gr)
-        recogniser_dataframe_out = gr.Dataframe(review_dataframe)
-        recogniser_entities_list = recogniser_dataframe_gr["label"].unique().tolist()
-        recogniser_entities_list = sorted(recogniser_entities_list)
-        recogniser_entities_list = [entity for entity in recogniser_entities_list if entity != 'Redaction']  # Remove any existing 'Redaction'
-        recogniser_entities_list.insert(0, 'Redaction')  # Add 'Redaction' to the start of the list
     zoom_str = str(zoom) + '%'
     recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
     if not image_annotator_object:
         page_num_reported = 1
@@ -156,9 +246,9 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
         handles_cursor=True,
         interactive=True
     )
-        number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
-        return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
     #print("page_num at start of update_annotator function:", page_num)
@@ -181,9 +271,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
         page_num_reported = page_max_reported
     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
@@ -204,11 +292,22 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
         interactive=True
     )
-    number_reported = gr.Number(label = "Page (press enter to change)", value=page_num_reported, precision=0)
-    return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_drop, recogniser_dataframe_out, recogniser_dataframe_gr
-def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_page:int, previous_page:int, all_image_annotations:List[AnnotatedImageData], recogniser_entities_drop=gr.Dropdown(value="ALL", allow_custom_value=True),recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]})), clear_all:bool=False):
     '''
     Overwrite current image annotations with modifications
     '''
@@ -216,43 +315,30 @@ def modify_existing_page_redactions(image_annotated:AnnotatedImageData, current_
     if not current_page:
         current_page = 1
-    #If no previous page or is 0, i.e. first time run, then rewrite current page
-    #if not previous_page:
-    #    previous_page = current_page
-    #print("image_annotated:", image_annotated)
-    image_annotated['image'] = all_image_annotations[previous_page - 1]["image"]
     if clear_all == False:
-        all_image_annotations[previous_page - 1] = image_annotated
     else:
         all_image_annotations[previous_page - 1]["boxes"] = []
-    #print("all_image_annotations:", all_image_annotations)
-    # Rewrite all_image_annotations search dataframe with latest updates
-    try:
-        review_dataframe = convert_review_json_to_pandas_df(all_image_annotations)[["page", "label"]]
-        #print("review_dataframe['label']", review_dataframe["label"])
-        recogniser_entities = review_dataframe["label"].unique().tolist()
-        recogniser_entities.append("ALL")
-        recogniser_entities = sorted(recogniser_entities)
-        recogniser_dataframe_out = gr.Dataframe(review_dataframe)
-        #recogniser_dataframe_gr = gr.Dataframe(review_dataframe)
-        recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_drop, choices=recogniser_entities, allow_custom_value=True, interactive=True)
-    except Exception as e:
-        print("Could not extract recogniser information:", e)
-        recogniser_dataframe_out = recogniser_dataframe
-    return all_image_annotations, current_page, current_page, recogniser_entities_drop, recogniser_dataframe_out
-def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], doc:Document, all_image_annotations:List[AnnotatedImageData], current_page:int, review_file_state, output_folder:str = output_folder, save_pdf:bool=True, progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf and export review files
     '''
-    #print("all_image_annotations:", all_image_annotations)
     output_files = []
     output_log_files = []
@@ -260,11 +346,11 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
     #print("File paths in apply_redactions:", file_paths)
-    image_annotated['image'] = all_image_annotations[current_page - 1]["image"]
-    all_image_annotations[current_page - 1] = image_annotated
-    if not image_annotated:
         print("No image annotations found")
         return doc, all_image_annotations
@@ -287,7 +373,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
                 draw = ImageDraw.Draw(image)
-                for img_annotation_box in image_annotated['boxes']:
                     coords = [img_annotation_box["xmin"],
                     img_annotation_box["ymin"],
                     img_annotation_box["xmax"],
@@ -318,6 +404,7 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
                 output_files.append(orig_pdf_file_path)
                 number_of_pages = pdf_doc.page_count
                 print("Saving pages to file.")
@@ -340,8 +427,17 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
                     elif isinstance(image_loc, str):
                         image = Image.open(image_loc)
                     pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
-                    pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
             else:
                 print("File type not recognised.")
@@ -370,31 +466,140 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:List[str], d
             # output_log_files.append(out_annotation_file_path)
             #print("Saving annotations to CSV review file")
-            #print("review_file_state:", review_file_state)
             # Convert json to csv and also save this
-            review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state)
             out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
             review_df.to_csv(out_review_file_file_path, index=None)
             output_files.append(out_review_file_file_path)
         except Exception as e:
-            print("Could not save annotations to csv file:", e)
     return doc, all_image_annotations, output_files, output_log_files
 def get_boxes_json(annotations:AnnotatedImageData):
     return annotations["boxes"]
-def update_entities_df(choice:str, df:pd.DataFrame):
-    if choice=="ALL":
-        return df
-    else:
-        return df.loc[df["label"]==choice,:]
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
         row_value_page = evt.row_value[0] # This is the page number value
         return row_value_page
 def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
@@ -454,7 +659,7 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
         # Load cropbox sizes
         if document_cropboxes:
-            print("Document cropboxes:", document_cropboxes)
             # Extract numbers safely using regex
             match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])

 from typing import List
 from gradio_image_annotation import image_annotator
 from gradio_image_annotation.image_annotator import AnnotatedImageData
+from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, convert_pandas_df_to_review_json, CUSTOM_BOX_COLOUR
 from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
 from tools.file_redaction import redact_page_with_pymupdf
 import json
     return result
def update_dropdown_list_based_on_dataframe(df:pd.DataFrame, column:str) -> List[str]:
    '''
    Build the list of choices for a filter dropdown from one column of a dataframe.

    The column values are converted to strings, de-duplicated and sorted. Sorting is
    done on the string form, so "10" sorts before "2". 'ALL' is always the first choice.

    Raises KeyError if the column is not present in the dataframe.
    '''
    unique_values = df[column].astype(str).unique().tolist()

    # 'ALL' goes first regardless of where it would fall alphabetically
    return ["ALL"] + sorted(unique_values)
def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:AnnotatedImageData,
                                 recogniser_dataframe_modified:pd.DataFrame,
                                 recogniser_dropdown_value:str,
                                 text_dropdown_value:str,
                                 page_dropdown_value:str,
                                 review_df:pd.DataFrame=None,
                                 page_sizes:List[str]=None):
    '''
    Create a filtered recogniser dataframe and associated dropdowns based on current information in the image annotator and review data frame.

    Returns a tuple of:
    (recogniser dataframe, the same recogniser dataframe again, label dropdown,
    list of label choices for new redaction boxes, text dropdown, page dropdown).

    If the annotations cannot be converted to a review dataframe, the function falls back
    to the 'page', 'label' and 'text' columns of recogniser_dataframe_modified.
    '''
    # Use fresh containers per call instead of shared mutable default arguments
    review_df = [] if review_df is None else review_df
    page_sizes = [] if page_sizes is None else page_sizes

    display_columns = ["page", "label", "text"]

    def make_dropdown(value, choices):
        return gr.Dropdown(value=value, choices=choices, allow_custom_value=True, interactive=True)

    try:
        review_dataframe = convert_review_json_to_pandas_df(image_annotator_object, review_df, page_sizes)

        recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
        recogniser_entities_drop = make_dropdown(recogniser_dropdown_value, recogniser_entities_for_drop)

        # This is the choice list for entities when creating a new redaction box:
        # 'Redaction' always comes first and the 'ALL' filter option is not a valid label
        recogniser_entities_list = ['Redaction'] + [entity for entity in recogniser_entities_for_drop if entity != 'Redaction' and entity != 'ALL']

        text_entities_drop = make_dropdown(text_dropdown_value, update_dropdown_list_based_on_dataframe(review_dataframe, "text"))
        page_entities_drop = make_dropdown(page_dropdown_value, update_dropdown_list_based_on_dataframe(review_dataframe, "page"))

        recogniser_dataframe_out = gr.Dataframe(review_dataframe[display_columns], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=display_columns)

    except Exception as e:
        print("Could not extract recogniser information:", e)

        # reindex (rather than [[...]] selection) so that a dataframe missing any of the
        # display columns - e.g. a completely empty one - does not raise a KeyError here
        recogniser_dataframe_out = recogniser_dataframe_modified.reindex(columns=display_columns)

        recogniser_entities_list = ["Redaction"]
        recogniser_entities_drop = make_dropdown(recogniser_dropdown_value, recogniser_dataframe_out["label"].astype(str).unique().tolist())
        text_entities_drop = make_dropdown(text_dropdown_value, recogniser_dataframe_out["text"].astype(str).unique().tolist())
        page_entities_drop = make_dropdown(page_dropdown_value, recogniser_dataframe_out["page"].astype(str).unique().tolist())

    return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=None, page_sizes:list[str]=None):
    '''
    Update recogniser dataframe information that appears alongside the pdf pages on the review screen.

    When the recogniser dataframe has no data yet (it is empty, or its first cell is an
    empty string), everything is rebuilt from the annotator object. Otherwise the existing
    dataframe is filtered by the current dropdown selections.

    Returns a tuple of:
    (label choices for new redaction boxes, dataframe to display, recogniser dataframe,
    label dropdown, text dropdown, page dropdown).
    '''
    # Use fresh containers per call instead of shared mutable default arguments
    review_df = [] if review_df is None else review_df
    page_sizes = [] if page_sizes is None else page_sizes

    needs_rebuild = recogniser_dataframe_modified.empty or recogniser_dataframe_modified.iloc[0,0] == ""

    if needs_rebuild:
        # Both 'no data' cases share this single unpacking so that every returned name
        # (in particular recogniser_entities_drop) is always bound
        recogniser_dataframe_modified, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop = get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
    else:
        review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_modified, page_dropdown_value, text_dropdown_value)

        recogniser_dataframe_out = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"])

        # The label dropdown offers every label in the unfiltered dataframe
        recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_modified, "label")
        recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)

        # Recogniser entities list is the list of choices that appear when you make a new redaction box
        recogniser_entities_list_base = recogniser_dataframe_modified["label"].astype(str).unique().tolist()
        recogniser_entities_list = ['Redaction'] + [entity for entity in recogniser_entities_list_base if entity != 'Redaction']

    return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
    '''
    Hand back the three backed-up states unchanged, in the order they were given.
    '''
    restored_states = (
        backup_review_state,
        backup_image_annotations_state,
        backup_recogniser_entity_dataframe_base,
    )
    return restored_states
def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows_df: pd.DataFrame, image_file_paths:List[str], page_sizes:List[dict], image_annotations_state:dict, recogniser_entity_dataframe_base:pd.DataFrame):
    '''
    Remove selected items from the review dataframe from the annotation object and review dataframe.

    Rows of review_df that match a selected row on 'label', 'page' and 'text' are dropped,
    and the annotation state is rebuilt from the remaining rows.

    Returns a tuple of:
    (updated review dataframe, updated annotation state, updated recogniser dataframe,
    backup review dataframe, backup annotation state, backup recogniser dataframe).
    The three backups are the inputs exactly as they were passed in.
    '''
    backup_review_state = review_df
    backup_image_annotations_state = image_annotations_state
    backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base

    # Nothing selected (or nothing to remove from): hand everything back unchanged
    # rather than replacing the current annotations and entity table with empty values
    if selected_rows_df.empty or review_df.empty:
        return review_df, image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base

    match_columns = ['label', 'page', 'text']

    # Ensure selected_rows_df has the same relevant columns
    selected_subset = selected_rows_df[match_columns].drop_duplicates()

    # Perform anti-join using merge with an indicator column
    merged_df = review_df.merge(selected_subset, on=match_columns, how='left', indicator=True)

    # Keep only the rows that do not have a match in selected_rows_df
    out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])

    out_image_annotations_state = convert_pandas_df_to_review_json(out_review_df, image_file_paths, page_sizes)
    out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]

    return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
+def update_annotator(image_annotator_object:AnnotatedImageData,
+                     page_num:int,
+                     recogniser_entities_dropdown_value:str="ALL",
+                     page_dropdown_value:str="ALL",
+                     text_dropdown_value:str="ALL",
+                     recogniser_dataframe_modified=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"]), zoom:int=100,
+                     review_df:pd.DataFrame=[],
+                     page_sizes:List[dict]=[]):
+    '''
+    Update a gradio_image_annotation object with new annotation data.
+    '''
+    # First, update the dataframe containing the found recognisers
+    recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_entities_drop, page_entities_drop = update_recogniser_dataframes(image_annotator_object, recogniser_dataframe_modified, recogniser_entities_dropdown_value, text_dropdown_value, page_dropdown_value, review_df, page_sizes)
+    #print("Creating output annotator object in update_annotator function")
     zoom_str = str(zoom) + '%'
     recogniser_colour_list = [(0, 0, 0) for _ in range(len(recogniser_entities_list))]
+    #print("recogniser_entities_list:", recogniser_entities_list)
+    #print("recogniser_colour_list:", recogniser_colour_list)
+    #print("zoom_str:", zoom_str)
     if not image_annotator_object:
         page_num_reported = 1
         handles_cursor=True,
         interactive=True
     )
+        number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
+        return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out, recogniser_dataframe_modified, text_entities_drop, page_entities_drop
     #print("page_num at start of update_annotator function:", page_num)
         page_num_reported = page_max_reported
     image_annotator_object = remove_duplicate_images_with_blank_boxes(image_annotator_object)
     out_image_annotator = image_annotator(
         value = image_annotator_object[page_num_reported - 1],
         boxes_alpha=0.1,
         interactive=True
     )
+    number_reported = gr.Number(label = "Current page", value=page_num_reported, precision=0)
+    return out_image_annotator, number_reported, number_reported, page_num_reported, recogniser_entities_dropdown_value, recogniser_dataframe_out, recogniser_dataframe_modified, text_entities_drop, page_entities_drop
def modify_existing_page_redactions(image_annotator_object:AnnotatedImageData,
                                    current_page:int,
                                    previous_page:int,
                                    all_image_annotations:List[AnnotatedImageData],
                                    recogniser_entities_dropdown_value="ALL",
                                    text_dropdown_value="ALL",
                                    page_dropdown_value="ALL",
                                    recogniser_dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"]),
                                    review_dataframe:pd.DataFrame=[],
                                    page_sizes:List[dict]=[],
                                    clear_all:bool=False
                                    ):
    '''
    Overwrite current image annotations with modifications

    The annotator object is stored against previous_page (1-based), keeping that page's
    existing image. If clear_all is set, the boxes on that page are emptied instead.

    Returns (all_image_annotations, current_page, current_page). The dropdown, dataframe
    and page size arguments are accepted but not used in this function.
    '''
    if not current_page:
        current_page = 1

    # A previous_page of 0 would index [-1] and overwrite the last page, and None would
    # raise on the subtraction, so fall back to the current page in both cases
    if not previous_page:
        previous_page = current_page

    page_index = previous_page - 1

    if not clear_all:
        # Keep the stored page image; only the boxes come from the annotator component
        image_annotator_object['image'] = all_image_annotations[page_index]["image"]
        all_image_annotations[page_index] = image_annotator_object
    else:
        all_image_annotations[page_index]["boxes"] = []

    return all_image_annotations, current_page, current_page
+def apply_redactions(image_annotator_object:AnnotatedImageData,
+                     file_paths:List[str],
+                     doc:Document,
+                     all_image_annotations:List[AnnotatedImageData],
+                     current_page:int,
+                     review_file_state:pd.DataFrame,
+                     output_folder:str = output_folder,
+                     save_pdf:bool=True,
+                     page_sizes:List[dict]=[],
+                     progress=gr.Progress(track_tqdm=True)):
     '''
     Apply modified redactions to a pymupdf and export review files
     '''
     output_files = []
     output_log_files = []
     #print("File paths in apply_redactions:", file_paths)
+    image_annotator_object['image'] = all_image_annotations[current_page - 1]["image"]
+    all_image_annotations[current_page - 1] = image_annotator_object
+    if not image_annotator_object:
         print("No image annotations found")
         return doc, all_image_annotations
                 draw = ImageDraw.Draw(image)
+                for img_annotation_box in image_annotator_object['boxes']:
                     coords = [img_annotation_box["xmin"],
                     img_annotation_box["ymin"],
                     img_annotation_box["xmax"],
                 output_files.append(orig_pdf_file_path)
                 number_of_pages = pdf_doc.page_count
+                original_cropboxes = []
                 print("Saving pages to file.")
                     elif isinstance(image_loc, str):
                         image = Image.open(image_loc)
+                    #print("all_image_annotations for page:", all_image_annotations[i])
+                    #print("image:", image)
                     pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
+                    original_cropboxes.append(pymupdf_page.cropbox.irect)
+                    pymupdf_page.set_cropbox = pymupdf_page.mediabox
+                    #print("pymupdf_page:", pymupdf_page)
+                    # print("original_cropboxes:", original_cropboxes)
+                    pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1])
             else:
                 print("File type not recognised.")
             # output_log_files.append(out_annotation_file_path)
             #print("Saving annotations to CSV review file")
+            #print("all_image_annotations before conversion in apply redactions:", all_image_annotations)
+            #print("review_file_state before conversion in apply redactions:", review_file_state)
+            #print("page_sizes before conversion in apply redactions:", page_sizes)
             # Convert json to csv and also save this
+            review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state, page_sizes=page_sizes)
             out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
+            print("Saving review file after convert_review_json function in apply redactions")
             review_df.to_csv(out_review_file_file_path, index=None)
             output_files.append(out_review_file_file_path)
         except Exception as e:
+            print("In apply redactions function, could not save annotations to csv file:", e)
     return doc, all_image_annotations, output_files, output_log_files
 def get_boxes_json(annotations:AnnotatedImageData):
     return annotations["boxes"]
def _normalise_dropdown_selection(selection):
    '''
    Return a dropdown selection as a non-empty list.

    A single string becomes a one-item list. An empty or missing selection is treated
    as 'ALL' (no restriction), so callers can safely read the first element.
    '''
    if isinstance(selection, str):
        return [selection]
    if not selection:
        return ["ALL"]
    return list(selection)

def _filter_review_dataframe(df:pd.DataFrame, selections:dict):
    '''
    Return a copy of df keeping only rows whose column values (as strings) are among the selected values. A selection containing 'ALL' leaves that column unrestricted.
    '''
    filtered_df = df.copy()

    for column, selected_values in selections.items():
        if "ALL" not in selected_values:
            filtered_df = filtered_df[filtered_df[column].astype(str).isin(selected_values)]

    return filtered_df

def _entity_dropdown(filtered_df:pd.DataFrame, column:str, selected_values:list):
    '''
    Build a dropdown whose choices are the values left in a column of the filtered dataframe, keeping the first selected value as the current value.
    '''
    choices = update_dropdown_list_based_on_dataframe(filtered_df, column)
    return gr.Dropdown(value=selected_values[0], choices=choices, allow_custom_value=True, interactive=True)

def update_entities_df_recogniser_entities(choice:str, df:pd.DataFrame, page_dropdown_value:str, text_dropdown_value:str):
    '''
    Update the rows in a dataframe depending on the user choice from the label dropdown

    Returns the filtered dataframe plus refreshed text and page dropdowns.
    '''
    choice = _normalise_dropdown_selection(choice)
    page_dropdown_value = _normalise_dropdown_selection(page_dropdown_value)
    text_dropdown_value = _normalise_dropdown_selection(text_dropdown_value)

    # Apply filtering based on dropdown selections
    filtered_df = _filter_review_dataframe(df, {"page": page_dropdown_value, "text": text_dropdown_value, "label": choice})

    text_entities_drop = _entity_dropdown(filtered_df, "text", text_dropdown_value)
    page_entities_drop = _entity_dropdown(filtered_df, "page", page_dropdown_value)

    return filtered_df, text_entities_drop, page_entities_drop

def update_entities_df_page(choice:str, df:pd.DataFrame, label_dropdown_value:str, text_dropdown_value:str):
    '''
    Update the rows in a dataframe depending on the user choice from the page dropdown

    Returns the filtered dataframe plus refreshed label and text dropdowns.
    '''
    choice = _normalise_dropdown_selection(choice)
    label_dropdown_value = _normalise_dropdown_selection(label_dropdown_value)
    text_dropdown_value = _normalise_dropdown_selection(text_dropdown_value)

    # Apply filtering based on dropdown selections
    filtered_df = _filter_review_dataframe(df, {"text": text_dropdown_value, "label": label_dropdown_value, "page": choice})

    recogniser_entities_drop = _entity_dropdown(filtered_df, "label", label_dropdown_value)
    text_entities_drop = _entity_dropdown(filtered_df, "text", text_dropdown_value)

    return filtered_df, recogniser_entities_drop, text_entities_drop

def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:str, page_dropdown_value:str):
    '''
    Update the rows in a dataframe depending on the user choice from the text dropdown

    Returns the filtered dataframe plus refreshed label and page dropdowns.
    '''
    choice = _normalise_dropdown_selection(choice)
    label_dropdown_value = _normalise_dropdown_selection(label_dropdown_value)
    page_dropdown_value = _normalise_dropdown_selection(page_dropdown_value)

    # Apply filtering based on dropdown selections
    filtered_df = _filter_review_dataframe(df, {"page": page_dropdown_value, "label": label_dropdown_value, "text": choice})

    recogniser_entities_drop = _entity_dropdown(filtered_df, "label", label_dropdown_value)
    page_entities_drop = _entity_dropdown(filtered_df, "page", page_dropdown_value)

    return filtered_df, recogniser_entities_drop, page_entities_drop
def reset_dropdowns():
    '''
    Return three dropdown updates, each with its value set back to "ALL".
    '''
    return tuple(gr.Dropdown(value="ALL", allow_custom_value=True) for _ in range(3))
 def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
+        print("evt.row_value[0]:", evt.row_value[0])
         row_value_page = evt.row_value[0] # This is the page number value
+        if isinstance(row_value_page, list):
+            row_value_page = row_value_page[0]
+        print("row_value_page:", row_value_page)
         return row_value_page
 def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, image_width:float, image_height:float, x1:float, y1:float, x2:float, y2:float):
         # Load cropbox sizes
         if document_cropboxes:
+            #print("Document cropboxes:", document_cropboxes)
             # Extract numbers safely using regex
             match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])