seanpedrickcase committed on
Commit
6319afc
·
1 Parent(s): 66e145d

Added more config options. Fixed some bugs with removing elements from the review page and with Adobe export. Made some UI rearrangements.

Browse files
app.py CHANGED
@@ -10,10 +10,11 @@ from datetime import datetime
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
- from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, output_folder, get_or_create_env_var, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
 
14
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
15
  from tools.file_redaction import choose_and_run_redactor
16
- from tools.file_conversion import prepare_image_or_pdf, get_input_file_names, CUSTOM_BOX_COLOUR
17
  from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
18
  from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
@@ -142,9 +143,6 @@ with app:
142
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
143
 
144
  ## Settings page variables
145
- default_allow_list_file_name = "default_allow_list.csv"
146
- default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
147
-
148
  default_deny_list_file_name = "default_deny_list.csv"
149
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
150
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
@@ -155,7 +153,11 @@ with app:
155
 
156
  # S3 settings for default allow list load
157
  s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
158
- s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=default_allow_list_file_name, visible=False)
 
 
 
 
159
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
160
 
161
  # Base dataframe for recognisers that is not modified subsequent to load
@@ -185,7 +187,7 @@ with app:
185
  ###
186
  with gr.Tab("Redact PDFs/images"):
187
  with gr.Accordion("Redact document", open = True):
188
- in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
189
  # if RUN_AWS_FUNCTIONS == "1":
190
  in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
191
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
@@ -217,18 +219,16 @@ with app:
217
  ###
218
  with gr.Tab("Review redactions", id="tab_object_annotation"):
219
 
220
- with gr.Accordion(label = "Review redaction file", open=True):
221
- output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
222
- upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
223
- with gr.Row():
224
- annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
225
  with gr.Row():
226
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
227
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
228
  with gr.Row():
229
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
230
 
231
- with gr.Row():
232
  with gr.Column(scale=2):
233
  with gr.Row(equal_height=True):
234
  annotation_last_page_button = gr.Button("Previous page", scale = 4)
@@ -236,7 +236,8 @@ with app:
236
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
237
  annotation_next_page_button = gr.Button("Next page", scale = 4)
238
  with gr.Column(scale=1):
239
- blank_markdown_top = gr.Markdown(value="", label="")
 
240
 
241
  with gr.Row():
242
  with gr.Column(scale=2):
@@ -261,12 +262,12 @@ with app:
261
  interactive=False
262
  )
263
  with gr.Column(scale=1):
264
- with gr.Row():
265
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
266
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
267
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
268
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
269
- with gr.Row():
270
  reset_dropdowns_btn = gr.Button(value="Reset filters")
271
  exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
272
  undo_last_removal_btn = gr.Button(value="Undo last element removal")
@@ -393,21 +394,22 @@ with app:
393
  ###
394
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
395
 
 
396
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
397
- success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
398
- outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes], api_name="redact_doc").\
399
  success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
400
 
401
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
402
- # current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
403
- # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
404
- # success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
405
 
406
  # If a file has been completed, the function will continue onto the next document
407
- # latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
408
- # outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes]).\
409
- # success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
410
- # success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
411
 
412
  ###
413
  # REVIEW PDF REDACTIONS
@@ -479,8 +481,9 @@ with app:
479
  success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
480
 
481
  exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
482
- success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])#.\
483
- #success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
 
484
 
485
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
486
  success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
@@ -488,7 +491,7 @@ with app:
488
  # Convert review file to xfdf Adobe format
489
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
490
  success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
491
- success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
492
 
493
  # Convert xfdf Adobe file back to review_file.csv
494
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
@@ -533,14 +536,14 @@ with app:
533
  # Get connection details on app load
534
  app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
535
 
536
- # If running on AWS, load in the default allow list file from S3
537
- # if RUN_AWS_FUNCTIONS == "1":
538
- # print("default_allow_list_output_folder_location:", default_allow_list_loc)
539
- # if not os.path.exists(default_allow_list_loc):
540
- # app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
541
- # success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
542
- # else:
543
- # app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
544
 
545
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
546
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
@@ -566,27 +569,7 @@ with app:
566
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
567
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
568
 
569
- # Get some environment variables and Launch the Gradio app
570
- COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
571
- print(f'The value of COGNITO_AUTH is {COGNITO_AUTH}')
572
- 1
573
- RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
574
- print(f'The value of RUN_DIRECT_MODE is {RUN_DIRECT_MODE}')
575
-
576
- MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
577
- print(f'The value of RUN_DIRECT_MODE is {MAX_QUEUE_SIZE}')
578
-
579
- MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
580
- print(f'The value of MAX_FILE_SIZE is {MAX_FILE_SIZE}')
581
-
582
- GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
583
- print(f'The value of GRADIO_SERVER_PORT is {GRADIO_SERVER_PORT}')
584
-
585
- ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
586
- print(f'The value of ROOT_PATH is {ROOT_PATH}')
587
 
588
- DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
589
- print(f'The value of DEFAULT_CONCURRENCY_LIMIT is {DEFAULT_CONCURRENCY_LIMIT}')
590
 
591
  if __name__ == "__main__":
592
 
 
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
 
13
+ from tools.config import output_folder, RUN_DIRECT_MODE, MAX_QUEUE_SIZE, DEFAULT_CONCURRENCY_LIMIT, MAX_FILE_SIZE, GRADIO_SERVER_PORT, ROOT_PATH, GET_DEFAULT_ALLOW_LIST, DEFAULT_ALLOW_LIST_PATH
14
+ from tools.helper_functions import ensure_output_folder_exists, add_folder_to_path, put_columns_in_df, get_connection_params, reveal_feedback_buttons, custom_regex_load, reset_state_vars, load_in_default_allow_list, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, reset_review_vars, merge_csv_files, load_all_output_files
15
  from tools.aws_functions import upload_file_to_s3, download_file_from_s3, RUN_AWS_FUNCTIONS, bucket_name
16
  from tools.file_redaction import choose_and_run_redactor
17
+ from tools.file_conversion import prepare_image_or_pdf, get_input_file_names
18
  from tools.redaction_review import apply_redactions, modify_existing_page_redactions, decrease_page, increase_page, update_annotator, update_zoom, update_entities_df_recogniser_entities, update_entities_df_page, update_entities_df_text, df_select_callback, convert_df_to_xfdf, convert_xfdf_to_dataframe, reset_dropdowns, exclude_selected_items_from_redaction, undo_last_removal
19
  from tools.data_anonymise import anonymise_data_files
20
  from tools.auth import authenticate_user
 
143
  prepare_for_review_bool = gr.Checkbox(label="prepare_for_review_bool", value=True, visible=False)
144
 
145
  ## Settings page variables
 
 
 
146
  default_deny_list_file_name = "default_deny_list.csv"
147
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
148
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
 
153
 
154
  # S3 settings for default allow list load
155
  s3_default_bucket = gr.Textbox(label = "Default S3 bucket", value=bucket_name, visible=False)
156
+
157
+ default_allow_list_file_name = "default_allow_list.csv"
158
+ default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
159
+
160
+ s3_default_allow_list_file = gr.Textbox(label = "Default allow list file", value=DEFAULT_ALLOW_LIST_PATH, visible=False)
161
  default_allow_list_output_folder_location = gr.Textbox(label = "Output default allow list location", value=default_allow_list_loc, visible=False)
162
 
163
  # Base dataframe for recognisers that is not modified subsequent to load
 
187
  ###
188
  with gr.Tab("Redact PDFs/images"):
189
  with gr.Accordion("Redact document", open = True):
190
+ in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "multiple", file_types=['.pdf', '.jpg', '.png', '.json', '.zip'], height=file_input_height)
191
  # if RUN_AWS_FUNCTIONS == "1":
192
  in_redaction_method = gr.Radio(label="Choose text extraction method. AWS Textract has a cost per page - $3.50 per 1,000 pages with signature detection (default), $1.50 without. Go to Redaction settings - AWS Textract options to remove signature detection.", value = default_ocr_val, choices=[text_ocr_option, tesseract_ocr_option, textract_option])
193
  pii_identification_method_drop = gr.Radio(label = "Choose PII detection method. AWS Comprehend has a cost of approximately $0.01 per 10,000 characters.", value = default_pii_detector, choices=[local_pii_detector, aws_pii_detector])
 
219
  ###
220
  with gr.Tab("Review redactions", id="tab_object_annotation"):
221
 
222
+ with gr.Accordion(label = "Review PDF redactions", open=True):
223
+ output_review_files = gr.File(label="Upload original PDF and 'review_file' csv here to review suggested redactions", file_count='multiple', height=file_input_height)
224
+ upload_previous_review_file_btn = gr.Button("Review PDF and 'review file' csv provided above", variant="primary")
 
 
225
  with gr.Row():
226
  annotate_zoom_in = gr.Button("Zoom in", visible=False)
227
  annotate_zoom_out = gr.Button("Zoom out", visible=False)
228
  with gr.Row():
229
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
230
 
231
+ with gr.Row(equal_height=True):
232
  with gr.Column(scale=2):
233
  with gr.Row(equal_height=True):
234
  annotation_last_page_button = gr.Button("Previous page", scale = 4)
 
236
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
237
  annotation_next_page_button = gr.Button("Next page", scale = 4)
238
  with gr.Column(scale=1):
239
+ annotation_button_apply = gr.Button("Apply revised redactions to PDF", variant="secondary")
240
+ #blank_markdown_top = gr.Markdown(value="", label="")
241
 
242
  with gr.Row():
243
  with gr.Column(scale=2):
 
262
  interactive=False
263
  )
264
  with gr.Column(scale=1):
265
+ with gr.Row(equal_height=True):
266
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
267
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
268
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
269
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"])
270
+ with gr.Row(equal_height=True):
271
  reset_dropdowns_btn = gr.Button(value="Reset filters")
272
  exclude_selected_btn = gr.Button(value="Exclude items in table from redactions")
273
  undo_last_removal_btn = gr.Button(value="Undo last element removal")
 
394
  ###
395
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
396
 
397
+ # Run redaction function
398
  document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
399
+ success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
400
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes], api_name="redact_doc").\
401
  success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
402
 
403
  # If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
404
+ current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
405
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
406
+ success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
407
 
408
  # If a file has been completed, the function will continue onto the next document
409
+ latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes],
410
+ outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, document_cropboxes]).\
411
+ success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
412
+ success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
413
 
414
  ###
415
  # REVIEW PDF REDACTIONS
 
481
  success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
482
 
483
  exclude_selected_btn.click(exclude_selected_items_from_redaction, inputs=[review_file_state, recogniser_entity_dataframe, images_pdf_state, page_sizes, all_image_annotations_state, recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base]).\
484
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown]).\
485
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
486
+ # success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown, recogniser_entity_dataframe_base, review_file_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
487
 
488
  undo_last_removal_btn.click(undo_last_removal, inputs=[backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base], outputs=[review_file_state, all_image_annotations_state, recogniser_entity_dataframe_base]).\
489
  success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown])
 
491
  # Convert review file to xfdf Adobe format
492
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
493
  success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool, in_fully_redacted_list_state, output_folder_textbox], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes, page_sizes]).\
494
+ success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes, page_sizes], outputs=[adobe_review_files_out])
495
 
496
  # Convert xfdf Adobe file back to review_file.csv
497
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
 
536
  # Get connection details on app load
537
  app.load(get_connection_params, inputs=[output_folder_textbox], outputs=[session_hash_state, output_folder_textbox, session_hash_textbox])
538
 
539
+ # If relevant environment variable is set, load in the default allow list file from S3
540
+ if GET_DEFAULT_ALLOW_LIST == "True" and DEFAULT_ALLOW_LIST_PATH:
541
+ print("Loading allow list from default_allow_list_output_folder_location:", default_allow_list_loc)
542
+ if not os.path.exists(default_allow_list_loc):
543
+ app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
544
+ success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
545
+ else:
546
+ app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
547
 
548
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
549
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
 
569
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
570
  success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
571
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
 
 
 
573
 
574
  if __name__ == "__main__":
575
 
requirements.txt CHANGED
@@ -13,7 +13,7 @@ spacy==3.8.4
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
  gradio==5.22.0
16
- boto3==1.36.26
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
 
13
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
14
  #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
15
  gradio==5.22.0
16
+ boto3==1.37.17
17
  pyarrow==19.0.1
18
  openpyxl==3.1.5
19
  Faker==36.1.1
tools/auth.py CHANGED
@@ -1,32 +1,12 @@
1
-
2
- import os
3
  import boto3
4
- import gradio as gr
5
  import hmac
6
  import hashlib
7
  import base64
 
8
 
9
- def get_or_create_env_var(var_name, default_value):
10
- # Get the environment variable if it exists
11
- value = os.environ.get(var_name)
12
-
13
- # If it doesn't exist, set it to the default value
14
- if value is None:
15
- os.environ[var_name] = default_value
16
- value = default_value
17
-
18
- return value
19
-
20
- client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
21
- #print(f'The value of AWS_CLIENT_ID is {client_id}')
22
-
23
- client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
24
- #print(f'The value of AWS_CLIENT_SECRET is {client_secret}')
25
-
26
- user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
27
- #print(f'The value of AWS_USER_POOL_ID is {user_pool_id}')
28
-
29
- def calculate_secret_hash(client_id, client_secret, username):
30
  message = username + client_id
31
  dig = hmac.new(
32
  str(client_secret).encode('utf-8'),
 
1
+ #import os
 
2
  import boto3
3
+ #import gradio as gr
4
  import hmac
5
  import hashlib
6
  import base64
7
+ from tools.config import client_id, client_secret, user_pool_id
8
 
9
+ def calculate_secret_hash(client_id:str, client_secret:str, username:str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  message = username + client_id
11
  dig = hmac.new(
12
  str(client_secret).encode('utf-8'),
tools/aws_functions.py CHANGED
@@ -3,37 +3,13 @@ import pandas as pd
3
  import boto3
4
  import tempfile
5
  import os
6
- from tools.helper_functions import get_or_create_env_var
7
- from dotenv import load_dotenv
8
 
9
  PandasDataFrame = Type[pd.DataFrame]
10
 
11
  # Get AWS credentials
12
- bucket_name=""
13
-
14
- RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
15
- print(f'The value of RUN_AWS_FUNCTIONS is {RUN_AWS_FUNCTIONS}')
16
-
17
- AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
18
- print(f'The value of AWS_REGION is {AWS_REGION}')
19
-
20
- # If you have an aws_config env file in the config folder, you can load in AWS keys this way
21
- AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '/env/aws_config.env')
22
- print(f'The value of AWS_CONFIG_PATH is {AWS_CONFIG_PATH}')
23
-
24
- if os.path.exists(AWS_CONFIG_PATH):
25
- print("Loading AWS keys from config folder")
26
- load_dotenv(AWS_CONFIG_PATH)
27
-
28
- AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
29
- if AWS_ACCESS_KEY:
30
- print(f'AWS_ACCESS_KEY found in environment variables')
31
-
32
- AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
33
- if AWS_SECRET_KEY:
34
- print(f'AWS_SECRET_KEY found in environment variables')
35
-
36
-
37
 
38
  def get_assumed_role_info():
39
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
@@ -49,14 +25,11 @@ def get_assumed_role_info():
49
  return assumed_role_arn, assumed_role_name
50
 
51
  if RUN_AWS_FUNCTIONS == "1":
52
- try:
53
- bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
54
  session = boto3.Session()
55
-
56
- #print("session:", session)
57
 
58
  except Exception as e:
59
- print("Could not start boto3 session:", e)
60
 
61
  try:
62
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
 
3
  import boto3
4
  import tempfile
5
  import os
6
+ from tools.config import AWS_REGION, RUN_AWS_FUNCTIONS, DOCUMENT_REDACTION_BUCKET
7
+
8
 
9
  PandasDataFrame = Type[pd.DataFrame]
10
 
11
  # Get AWS credentials
12
+ bucket_name = DOCUMENT_REDACTION_BUCKET
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  def get_assumed_role_info():
15
  sts_endpoint = 'https://sts.' + AWS_REGION + '.amazonaws.com'
 
25
  return assumed_role_arn, assumed_role_name
26
 
27
  if RUN_AWS_FUNCTIONS == "1":
28
+ try:
 
29
  session = boto3.Session()
 
 
30
 
31
  except Exception as e:
32
+ print("Could not start boto3 session:", e)
33
 
34
  try:
35
  assumed_role_arn, assumed_role_name = get_assumed_role_info()
tools/aws_textract.py CHANGED
@@ -1,5 +1,4 @@
1
  import boto3
2
- #from PIL import Image
3
  from typing import List
4
  import io
5
  import os
@@ -7,12 +6,10 @@ import json
7
  from collections import defaultdict
8
  import pikepdf
9
  import time
10
- # Example: converting this single page to an image
11
- #from pdf2image import convert_from_bytes
12
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
13
- from tools.aws_functions import AWS_ACCESS_KEY, AWS_SECRET_KEY
14
 
15
- def extract_textract_metadata(response):
16
  """Extracts metadata from an AWS Textract response."""
17
 
18
  #print("Document metadata:", response['DocumentMetadata'])
@@ -83,8 +80,7 @@ def analyse_page_with_textract(pdf_page_bytes:object, page_no:int, client:str=""
83
  # Return a list containing the wrapped response and the metadata
84
  return wrapped_response, request_metadata # Return as a list to match the desired structure
85
 
86
-
87
- def convert_pike_pdf_page_to_bytes(pdf, page_num):
88
  # Create a new empty PDF
89
  new_pdf = pikepdf.Pdf.new()
90
 
@@ -109,8 +105,7 @@ def convert_pike_pdf_page_to_bytes(pdf, page_num):
109
 
110
  return pdf_bytes
111
 
112
-
113
- def json_to_ocrresult(json_data, page_width, page_height, page_no):
114
  '''
115
  Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
116
  '''
@@ -274,7 +269,7 @@ def json_to_ocrresult(json_data, page_width, page_height, page_no):
274
 
275
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
276
 
277
- def load_and_convert_textract_json(textract_json_file_path, log_files_output_paths):
278
  """
279
  Loads Textract JSON from a file, detects if conversion is needed,
280
  and converts if necessary.
@@ -317,8 +312,6 @@ def load_and_convert_textract_json(textract_json_file_path, log_files_output_pat
317
  print("textract data:", textract_data)
318
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
319
 
320
-
321
-
322
  # Load Textract JSON output (assuming it's stored in a variable called `textract_output`)
323
  def restructure_textract_output(textract_output:object):
324
  '''
 
1
  import boto3
 
2
  from typing import List
3
  import io
4
  import os
 
6
  from collections import defaultdict
7
  import pikepdf
8
  import time
 
 
9
  from tools.custom_image_analyser_engine import OCRResult, CustomImageRecognizerResult
10
+ from tools.config import AWS_ACCESS_KEY, AWS_SECRET_KEY
11
 
12
+ def extract_textract_metadata(response:object):
13
  """Extracts metadata from an AWS Textract response."""
14
 
15
  #print("Document metadata:", response['DocumentMetadata'])
 
80
  # Return a list containing the wrapped response and the metadata
81
  return wrapped_response, request_metadata # Return as a list to match the desired structure
82
 
83
+ def convert_pike_pdf_page_to_bytes(pdf:object, page_num:int):
 
84
  # Create a new empty PDF
85
  new_pdf = pikepdf.Pdf.new()
86
 
 
105
 
106
  return pdf_bytes
107
 
108
+ def json_to_ocrresult(json_data:dict, page_width:float, page_height:float, page_no:int):
 
109
  '''
110
  Convert the json response from textract to the OCRResult format used elsewhere in the code. Looks for lines, words, and signatures. Handwriting and signatures are set aside especially for later in case the user wants to override the default behaviour and redact all handwriting/signatures.
111
  '''
 
269
 
270
  return all_ocr_results, signature_or_handwriting_recogniser_results, signature_recogniser_results, handwriting_recogniser_results, ocr_results_with_children
271
 
272
+ def load_and_convert_textract_json(textract_json_file_path:str, log_files_output_paths:str):
273
  """
274
  Loads Textract JSON from a file, detects if conversion is needed,
275
  and converts if necessary.
 
312
  print("textract data:", textract_data)
313
  return {}, True, log_files_output_paths # Return empty data if JSON is not recognized
314
 
 
 
315
  # Load Textract JSON output (assuming it's stored in a variable called `textract_output`)
316
  def restructure_textract_output(textract_output:object):
317
  '''
tools/cli_redact.py CHANGED
@@ -1,12 +1,13 @@
1
  import argparse
2
  import os
3
- from tools.helper_functions import ensure_output_folder_exists, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
 
4
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
5
  from tools.file_redaction import choose_and_run_redactor
6
  import pandas as pd
7
  from datetime import datetime
8
 
9
- chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
10
  'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
11
  'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
12
  'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
 
1
  import argparse
2
  import os
3
+ from tools.config import get_or_create_env_var
4
+ from tools.helper_functions import ensure_output_folder_exists,tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
5
  from tools.file_conversion import get_input_file_names, prepare_image_or_pdf
6
  from tools.file_redaction import choose_and_run_redactor
7
  import pandas as pd
8
  from datetime import datetime
9
 
10
+ chosen_comprehend_entities = ['BANK_ACCOUNT_NUMBER','BANK_ROUTING','CREDIT_DEBIT_NUMBER', 'CREDIT_DEBIT_CVV', 'CREDIT_DEBIT_EXPIRY','PIN','EMAIL','ADDRESS',
11
  'NAME','PHONE', 'PASSPORT_NUMBER','DRIVER_ID', 'USERNAME','PASSWORD',
12
  'IP_ADDRESS','MAC_ADDRESS','LICENSE_PLATE',
13
  'VEHICLE_IDENTIFICATION_NUMBER','UK_NATIONAL_INSURANCE_NUMBER',
tools/config.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ # Set or retrieve configuration variables for the redaction app
5
+
6
+ def get_or_create_env_var(var_name:str, default_value:str, print_val:bool=False):
7
+ '''
8
+ Get an environmental variable, and set it to a default value if it doesn't exist
9
+ '''
10
+ # Get the environment variable if it exists
11
+ value = os.environ.get(var_name)
12
+
13
+ # If it doesn't exist, set the environment variable to the default value
14
+ if value is None:
15
+ os.environ[var_name] = default_value
16
+ value = default_value
17
+
18
+ if print_val == True:
19
+ print(f'The value of {var_name} is {value}')
20
+
21
+ return value
22
+
23
+
24
+ # If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. '/env/app_config.env'
25
+ APP_CONFIG_PATH = get_or_create_env_var('APP_CONFIG_PATH', '', print_val=True)
26
+
27
+
28
+ if os.path.exists(APP_CONFIG_PATH):
29
+ print(f"Loading APP variables from config file {APP_CONFIG_PATH}")
30
+ load_dotenv(APP_CONFIG_PATH)
31
+
32
+ ###
33
+ # AWS CONFIG
34
+ ###
35
+
36
+ # If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. '/env/aws_config.env'
37
+ AWS_CONFIG_PATH = get_or_create_env_var('AWS_CONFIG_PATH', '', print_val=True)
38
+
39
+ if os.path.exists(AWS_CONFIG_PATH):
40
+ print(f"Loading AWS variables from config file {AWS_CONFIG_PATH}")
41
+ load_dotenv(AWS_CONFIG_PATH)
42
+
43
+ RUN_AWS_FUNCTIONS = get_or_create_env_var("RUN_AWS_FUNCTIONS", "0")
44
+
45
+ AWS_REGION = get_or_create_env_var('AWS_REGION', 'eu-west-2')
46
+
47
+ client_id = get_or_create_env_var('AWS_CLIENT_ID', '')
48
+
49
+ client_secret = get_or_create_env_var('AWS_CLIENT_SECRET', '')
50
+
51
+ user_pool_id = get_or_create_env_var('AWS_USER_POOL_ID', '')
52
+
53
+ AWS_ACCESS_KEY = get_or_create_env_var('AWS_ACCESS_KEY', '')
54
+ if AWS_ACCESS_KEY: print(f'AWS_ACCESS_KEY found in environment variables')
55
+
56
+ AWS_SECRET_KEY = get_or_create_env_var('AWS_SECRET_KEY', '')
57
+ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
58
+
59
+ DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')
60
+
61
+ # Custom headers e.g. if routing traffic through Cloudfront
62
+ # Retrieving or setting CUSTOM_HEADER
63
+ CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
64
+ if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')
65
+
66
+ # Retrieving or setting CUSTOM_HEADER_VALUE
67
+ CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
68
+ if CUSTOM_HEADER_VALUE: print(f'CUSTOM_HEADER_VALUE found')
69
+
70
+ ###
71
+ # Images config
72
+ ###
73
+ IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
74
+ LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
75
+ MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py
76
+
77
+ ###
78
+ # File I/O config
79
+ ###
80
+
81
+ output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
82
+ print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
83
+
84
+ session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
85
+ print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
86
+
87
+ input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
88
+ print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
89
+
90
+ ###
91
+ # REDACTION CONFIG
92
+ ###
93
+ # Number of pages to loop through before breaking the function and restarting from the last finished page.
94
+ page_break_value = get_or_create_env_var('page_break_value', '50000')
95
+
96
+ max_time_value = get_or_create_env_var('max_time_value', '999999')
97
+
98
+ CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
99
+
100
+ ###
101
+ # APP RUN CONFIG
102
+ ###
103
+ # Get some environment variables and Launch the Gradio app
104
+ COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
105
+
106
+ RUN_DIRECT_MODE = get_or_create_env_var('RUN_DIRECT_MODE', '0')
107
+
108
+ MAX_QUEUE_SIZE = int(get_or_create_env_var('MAX_QUEUE_SIZE', '5'))
109
+
110
+ MAX_FILE_SIZE = get_or_create_env_var('MAX_FILE_SIZE', '250mb')
111
+
112
+ GRADIO_SERVER_PORT = int(get_or_create_env_var('GRADIO_SERVER_PORT', '7860'))
113
+
114
+ ROOT_PATH = get_or_create_env_var('ROOT_PATH', '')
115
+
116
+ DEFAULT_CONCURRENCY_LIMIT = get_or_create_env_var('DEFAULT_CONCURRENCY_LIMIT', '5')
117
+
118
+ GET_DEFAULT_ALLOW_LIST = get_or_create_env_var('GET_DEFAULT_ALLOW_LIST', 'False')
119
+
120
+ DEFAULT_ALLOW_LIST_PATH = get_or_create_env_var('DEFAULT_ALLOW_LIST_PATH', '')
tools/custom_csvlogger.py CHANGED
@@ -8,9 +8,7 @@ from collections.abc import Sequence
8
  from multiprocessing import Lock
9
  from pathlib import Path
10
  from typing import TYPE_CHECKING, Any
11
-
12
  from gradio_client import utils as client_utils
13
-
14
  import gradio as gr
15
  from gradio import utils, wasm_utils
16
 
 
8
  from multiprocessing import Lock
9
  from pathlib import Path
10
  from typing import TYPE_CHECKING, Any
 
11
  from gradio_client import utils as client_utils
 
12
  import gradio as gr
13
  from gradio import utils, wasm_utils
14
 
tools/custom_image_analyser_engine.py CHANGED
@@ -405,7 +405,7 @@ def bounding_boxes_overlap(box1:List, box2:List):
405
  return (box1[0] < box2[2] and box2[0] < box1[2] and
406
  box1[1] < box2[3] and box2[1] < box1[3])
407
 
408
- def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results:List[Tuple]):
409
  for entity in page_analyser_result:
410
  entity_start = entity.start
411
  entity_end = entity.end
@@ -443,7 +443,7 @@ def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_li
443
 
444
  return all_text_line_results
445
 
446
- def map_back_comprehend_entity_results(response, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
447
  if not response or "Entities" not in response:
448
  return all_text_line_results
449
 
@@ -686,7 +686,7 @@ def run_page_text_redaction(
686
 
687
  return page_analysed_bounding_boxes
688
 
689
- def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
690
  '''
691
  Merge identified bounding boxes containing PII that are very close to one another
692
  '''
@@ -776,7 +776,7 @@ def merge_text_bounding_boxes(analyser_results, characters: List[LTChar], combin
776
  return analysed_bounding_boxes
777
 
778
  # Function to combine OCR results into line-level results
779
- def combine_ocr_results(ocr_results, x_threshold=50, y_threshold=12):
780
  # Group OCR results into lines based on y_threshold
781
  lines = []
782
  current_line = []
 
405
  return (box1[0] < box2[2] and box2[0] < box1[2] and
406
  box1[1] < box2[3] and box2[1] < box1[3])
407
 
408
+ def map_back_entity_results(page_analyser_result:dict, page_text_mapping:dict, all_text_line_results:List[Tuple]):
409
  for entity in page_analyser_result:
410
  entity_start = entity.start
411
  entity_end = entity.end
 
443
 
444
  return all_text_line_results
445
 
446
+ def map_back_comprehend_entity_results(response:object, current_batch_mapping:List[Tuple], allow_list:List[str], chosen_redact_comprehend_entities:List[str], all_text_line_results:List[Tuple]):
447
  if not response or "Entities" not in response:
448
  return all_text_line_results
449
 
 
686
 
687
  return page_analysed_bounding_boxes
688
 
689
+ def merge_text_bounding_boxes(analyser_results:dict, characters: List[LTChar], combine_pixel_dist: int = 20, vertical_padding: int = 0):
690
  '''
691
  Merge identified bounding boxes containing PII that are very close to one another
692
  '''
 
776
  return analysed_bounding_boxes
777
 
778
  # Function to combine OCR results into line-level results
779
+ def combine_ocr_results(ocr_results:dict, x_threshold:float=50.0, y_threshold:float=12.0):
780
  # Group OCR results into lines based on y_threshold
781
  lines = []
782
  current_line = []
tools/data_anonymise.py CHANGED
@@ -13,12 +13,11 @@ from typing import List, Dict, Any
13
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
14
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
15
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
16
- from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
17
 
18
- from tools.helper_functions import output_folder, get_file_name_without_type, read_file, detect_file_type
 
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  from tools.custom_image_analyser_engine import do_aws_comprehend_call
21
-
22
  # Use custom version of analyze_dict to be able to track progress
23
  from tools.presidio_analyzer_custom import analyze_dict
24
 
 
13
  from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, DictAnalyzerResult, RecognizerResult
14
  from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
15
  from presidio_anonymizer.entities import OperatorConfig, ConflictResolutionStrategy
 
16
 
17
+ from tools.config import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, output_folder
18
+ from tools.helper_functions import get_file_name_without_type, read_file, detect_file_type
19
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_word_list_recogniser, CustomWordFuzzyRecognizer, custom_entities
20
  from tools.custom_image_analyser_engine import do_aws_comprehend_call
 
21
  # Use custom version of analyze_dict to be able to track progress
22
  from tools.presidio_analyzer_custom import analyze_dict
23
 
tools/file_conversion.py CHANGED
@@ -1,13 +1,14 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
- from tools.helper_functions import get_file_name_without_type, output_folder, tesseract_ocr_option, text_ocr_option, textract_option, read_file, get_or_create_env_var
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
6
  import time
7
  import json
8
  import pymupdf
 
9
  import pandas as pd
10
- import numpy as np
11
  import shutil
12
  from pymupdf import Rect
13
  from fitz import Page
@@ -19,9 +20,13 @@ from pdf2image import convert_from_path
19
  from PIL import Image
20
  from scipy.spatial import cKDTree
21
 
22
- image_dpi = 300.0
23
- ImageFile.LOAD_TRUNCATED_IMAGES = True
24
- Image.MAX_IMAGE_PIXELS = None
 
 
 
 
25
 
26
  def is_pdf_or_image(filename):
27
  """
@@ -54,8 +59,7 @@ def is_pdf(filename):
54
  # %%
55
  ## Convert pdf to image if necessary
56
 
57
- CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
58
- print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
59
 
60
  def check_image_size_and_reduce(out_path:str, image:Image):
61
  '''
@@ -360,6 +364,27 @@ def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colou
360
 
361
  return whole_page_img_annotation_box
362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
363
  def prepare_image_or_pdf(
364
  file_paths: List[str],
365
  in_redact_method: str,
@@ -371,6 +396,7 @@ def prepare_image_or_pdf(
371
  prepare_for_review:bool = False,
372
  in_fully_redacted_list:List[int]=[],
373
  output_folder:str=output_folder,
 
374
  progress: Progress = Progress(track_tqdm=True)
375
  ) -> tuple[List[str], List[str]]:
376
  """
@@ -390,6 +416,7 @@ def prepare_image_or_pdf(
390
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
391
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
392
  output_folder (optional, str): The output folder for file save
 
393
  progress (optional, Progress): Progress tracker for the operation
394
 
395
 
@@ -400,6 +427,10 @@ def prepare_image_or_pdf(
400
  tic = time.perf_counter()
401
  json_from_csv = False
402
  original_cropboxes = [] # Store original CropBox values
 
 
 
 
403
 
404
  if isinstance(in_fully_redacted_list, pd.DataFrame):
405
  if not in_fully_redacted_list.empty:
@@ -426,11 +457,6 @@ def prepare_image_or_pdf(
426
  if isinstance(out_message, str):
427
  out_message = [out_message]
428
 
429
- converted_file_paths = []
430
- image_file_paths = []
431
- pymupdf_doc = []
432
- review_file_csv = pd.DataFrame()
433
-
434
  if not file_paths:
435
  file_paths = []
436
 
@@ -496,23 +522,35 @@ def prepare_image_or_pdf(
496
  # If a pdf, load as a pymupdf document
497
  if is_pdf(file_path):
498
  pymupdf_doc = pymupdf.open(file_path)
 
499
 
500
  # Load cropbox dimensions to use later
501
 
502
  converted_file_path = file_path
503
- image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
504
- page_sizes = []
505
 
506
- for i, page in enumerate(pymupdf_doc):
507
- page_no = i
508
- reported_page_no = i + 1
 
 
 
 
 
 
 
 
 
 
 
509
 
510
- pymupdf_page = pymupdf_doc.load_page(page_no)
511
- original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
 
 
 
 
512
 
513
- # Create a page_sizes_object
514
- out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
515
- page_sizes.append(out_page_image_sizes)
516
 
517
  #Create base version of the annotation object that doesn't have any annotations in it
518
  if (not all_annotations_object) & (prepare_for_review == True):
@@ -521,6 +559,7 @@ def prepare_image_or_pdf(
521
  for image_path in image_file_paths:
522
  annotation = {}
523
  annotation["image"] = image_path
 
524
 
525
  all_annotations_object.append(annotation)
526
 
@@ -546,7 +585,7 @@ def prepare_image_or_pdf(
546
 
547
  #print("image_file_paths:", image_file_paths)
548
  # Create a page_sizes_object
549
- out_page_image_sizes = {"page":1, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
550
  page_sizes.append(out_page_image_sizes)
551
 
552
  converted_file_path = output_folder + file_name_with_ext
@@ -557,7 +596,7 @@ def prepare_image_or_pdf(
557
 
558
  elif file_extension in ['.csv']:
559
  review_file_csv = read_file(file)
560
- all_annotations_object = convert_pandas_df_to_review_json(review_file_csv, image_file_paths, page_sizes)
561
  json_from_csv = True
562
  print("Converted CSV review file to json")
563
 
@@ -708,7 +747,7 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
708
 
709
  return out_message, out_file_paths
710
 
711
- def join_values_within_threshold(df1, df2):
712
  # Threshold for matching
713
  threshold = 5
714
 
@@ -739,7 +778,7 @@ def join_values_within_threshold(df1, df2):
739
  print(final_df)
740
 
741
 
742
- def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
743
  '''
744
  Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
745
  '''
@@ -887,7 +926,7 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
887
  # review_file_df[col] = np.floor(review_file_df[col])
888
 
889
  # If colours are saved as list, convert to tuple
890
- review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
891
 
892
  # print("page_sizes:", page_sizes)
893
 
@@ -910,32 +949,35 @@ def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decis
910
 
911
  review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
912
 
913
- review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
914
 
915
  return review_file_df
916
 
917
- def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
918
  '''
919
  Convert a review csv to a json file for use by the Gradio Annotation object.
920
  '''
921
 
922
- if page_sizes:
923
-
924
  page_sizes_df = pd.DataFrame(page_sizes)
925
 
926
- #print(page_sizes_df)
 
 
927
 
928
- if "image_width" not in review_file_df.columns:
929
- review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
930
 
931
- #print("review_file_df in convert pandas df to review json function:", review_file_df[["xmin", "xmax", "ymin", "ymax"]])
932
-
933
- # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
934
- if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
935
- review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
936
- review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
937
- review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
938
- review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
 
939
 
940
  # Keep only necessary columns
941
  review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
@@ -949,9 +991,8 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
949
  # Create a list to hold the JSON data
950
  json_data = []
951
 
952
- for n, pdf_image_path in enumerate(image_paths):
953
- reported_page_number = int(n + 1)
954
-
955
 
956
  if reported_page_number in review_file_df["page"].values:
957
 
@@ -969,6 +1010,7 @@ def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths:
969
  else:
970
  annotation = {}
971
  annotation["image"] = pdf_image_path
 
972
 
973
  # Append the structured data to the json_data list
974
  json_data.append(annotation)
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+
3
  from PIL import Image, ImageFile
4
  import os
5
  import re
6
  import time
7
  import json
8
  import pymupdf
9
+ from pymupdf import Document
10
  import pandas as pd
11
+ #import numpy as np
12
  import shutil
13
  from pymupdf import Rect
14
  from fitz import Page
 
20
  from PIL import Image
21
  from scipy.spatial import cKDTree
22
 
23
+ from tools.config import output_folder, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
24
+ from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
25
+
26
+ image_dpi = float(IMAGES_DPI)
27
+ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
28
+ else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
29
+ ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
30
 
31
  def is_pdf_or_image(filename):
32
  """
 
59
  # %%
60
  ## Convert pdf to image if necessary
61
 
62
+
 
63
 
64
  def check_image_size_and_reduce(out_path:str, image:Image):
65
  '''
 
364
 
365
  return whole_page_img_annotation_box
366
 
367
+ def create_page_size_objects(pymupdf_doc:Document, image_sizes_width:List[float], image_sizes_height:List[float]):
368
+ page_sizes = []
369
+ original_cropboxes = []
370
+
371
+ for page_no, page in enumerate(pymupdf_doc):
372
+ reported_page_no = page_no + 1
373
+
374
+ pymupdf_page = pymupdf_doc.load_page(page_no)
375
+ original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
376
+
377
+ # Create a page_sizes_object.
378
+ # If images have been created, then image width an height come from this value. Otherwise, they are set to the cropbox size
379
+ if image_sizes_width and image_sizes_height:
380
+ out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
381
+ else:
382
+ out_page_image_sizes = {"page":reported_page_no, "image_width":pd.NA(), "image_height":pd.NA(), "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
383
+
384
+ page_sizes.append(out_page_image_sizes)
385
+
386
+ return page_sizes, original_cropboxes
387
+
388
  def prepare_image_or_pdf(
389
  file_paths: List[str],
390
  in_redact_method: str,
 
396
  prepare_for_review:bool = False,
397
  in_fully_redacted_list:List[int]=[],
398
  output_folder:str=output_folder,
399
+ prepare_images:bool=True,
400
  progress: Progress = Progress(track_tqdm=True)
401
  ) -> tuple[List[str], List[str]]:
402
  """
 
416
  prepare_for_review(optional, bool): Is this preparation step preparing pdfs and json files to review current redactions?
417
  in_fully_redacted_list(optional, List of int): A list of pages to fully redact
418
  output_folder (optional, str): The output folder for file save
419
+ prepare_images (optional, bool): A boolean indicating whether to create images for each PDF page. Defaults to true
420
  progress (optional, Progress): Progress tracker for the operation
421
 
422
 
 
427
  tic = time.perf_counter()
428
  json_from_csv = False
429
  original_cropboxes = [] # Store original CropBox values
430
+ converted_file_paths = []
431
+ image_file_paths = []
432
+ pymupdf_doc = []
433
+ review_file_csv = pd.DataFrame()
434
 
435
  if isinstance(in_fully_redacted_list, pd.DataFrame):
436
  if not in_fully_redacted_list.empty:
 
457
  if isinstance(out_message, str):
458
  out_message = [out_message]
459
 
 
 
 
 
 
460
  if not file_paths:
461
  file_paths = []
462
 
 
522
  # If a pdf, load as a pymupdf document
523
  if is_pdf(file_path):
524
  pymupdf_doc = pymupdf.open(file_path)
525
+ pymupdf_pages = pymupdf_doc.page_count
526
 
527
  # Load cropbox dimensions to use later
528
 
529
  converted_file_path = file_path
 
 
530
 
531
+ if prepare_images==True:
532
+ image_file_paths, image_sizes_width, image_sizes_height = process_file(file_path, prepare_for_review)
533
+ else:
534
+ print("Skipping image preparation")
535
+ image_file_paths=[]
536
+ image_sizes_width=[]
537
+ image_sizes_height=[]
538
+
539
+ # Create page sizes object
540
+ # page_sizes = []
541
+
542
+ # for i, page in enumerate(pymupdf_doc):
543
+ # page_no = i
544
+ # reported_page_no = i + 1
545
 
546
+ # pymupdf_page = pymupdf_doc.load_page(page_no)
547
+ # original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
548
+
549
+ # # Create a page_sizes_object
550
+ # out_page_image_sizes = {"page":reported_page_no, "image_width":image_sizes_width[page_no], "image_height":image_sizes_height[page_no], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":pymupdf_page.cropbox.width, "cropbox_height":pymupdf_page.cropbox.height}
551
+ # page_sizes.append(out_page_image_sizes)
552
 
553
+ page_sizes, original_cropboxes = create_page_size_objects(pymupdf_doc, image_sizes_width, image_sizes_height)
 
 
554
 
555
  #Create base version of the annotation object that doesn't have any annotations in it
556
  if (not all_annotations_object) & (prepare_for_review == True):
 
559
  for image_path in image_file_paths:
560
  annotation = {}
561
  annotation["image"] = image_path
562
+ annotation["boxes"] = []
563
 
564
  all_annotations_object.append(annotation)
565
 
 
585
 
586
  #print("image_file_paths:", image_file_paths)
587
  # Create a page_sizes_object
588
+ out_page_image_sizes = {"page":1, "image_width":image_sizes_width[0], "image_height":image_sizes_height[0], "mediabox_width":pymupdf_page.mediabox.width, "mediabox_height": pymupdf_page.mediabox.height, "cropbox_width":original_cropboxes[-1].width, "cropbox_height":original_cropboxes[-1].height}
589
  page_sizes.append(out_page_image_sizes)
590
 
591
  converted_file_path = output_folder + file_name_with_ext
 
596
 
597
  elif file_extension in ['.csv']:
598
  review_file_csv = read_file(file)
599
+ all_annotations_object = convert_review_df_to_annotation_json(review_file_csv, image_file_paths, page_sizes)
600
  json_from_csv = True
601
  print("Converted CSV review file to json")
602
 
 
747
 
748
  return out_message, out_file_paths
749
 
750
+ def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
751
  # Threshold for matching
752
  threshold = 5
753
 
 
778
  print(final_df)
779
 
780
 
781
+ def convert_annotation_json_to_review_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame(), page_sizes:List[dict]=[]) -> pd.DataFrame:
782
  '''
783
  Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
784
  '''
 
926
  # review_file_df[col] = np.floor(review_file_df[col])
927
 
928
  # If colours are saved as list, convert to tuple
929
+ review_file_df.loc[:,"color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
930
 
931
  # print("page_sizes:", page_sizes)
932
 
 
949
 
950
  review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
951
 
952
+ #review_file_df.to_csv(output_folder + "review_file_test.csv", index=None)
953
 
954
  return review_file_df
955
 
956
+ def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame, image_paths:List[Image.Image], page_sizes:List[dict]=[]) -> List[dict]:
957
  '''
958
  Convert a review csv to a json file for use by the Gradio Annotation object.
959
  '''
960
 
961
+ # Convert relative co-ordinates into image coordinates for the image annotation output object
962
+ if page_sizes:
963
  page_sizes_df = pd.DataFrame(page_sizes)
964
 
965
+ # If there are no image coordinates, then just convert the first page to image to be able to see this at least.
966
+ if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
967
+ print("No image dimensions found, converting first page.")
968
 
969
+ # If no nulls, then can do image coordinate conversion
970
+ elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
971
 
972
+ if "image_width" not in review_file_df.columns:
973
+ review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
974
+
975
+ # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
976
+ if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
977
+ review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
978
+ review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
979
+ review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
980
+ review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
981
 
982
  # Keep only necessary columns
983
  review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
 
991
  # Create a list to hold the JSON data
992
  json_data = []
993
 
994
+ for page_no, pdf_image_path in enumerate(image_paths):
995
+ reported_page_number = int(page_no + 1)
 
996
 
997
  if reported_page_number in review_file_df["page"].values:
998
 
 
1010
  else:
1011
  annotation = {}
1012
  annotation["image"] = pdf_image_path
1013
+ annotation["boxes"] = []
1014
 
1015
  # Append the structured data to the json_data list
1016
  json_data.append(annotation)
tools/file_redaction.py CHANGED
@@ -8,38 +8,29 @@ import copy
8
 
9
  from tqdm import tqdm
10
  from PIL import Image, ImageChops, ImageFile, ImageDraw
11
- ImageFile.LOAD_TRUNCATED_IMAGES = True
12
  from typing import List, Dict, Tuple
13
  import pandas as pd
14
 
15
- #from presidio_image_redactor.entities import ImageRecognizerResult
16
  from pdfminer.high_level import extract_pages
17
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
18
  from pikepdf import Pdf, Dictionary, Name
19
- import pymupdf
20
- from pymupdf import Rect
21
- from fitz import Page
22
  import gradio as gr
23
  from gradio import Progress
24
  from collections import defaultdict # For efficient grouping
25
 
26
- from presidio_analyzer import RecognizerResult
27
- from tools.aws_functions import RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY
28
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
29
- from tools.file_conversion import process_file, image_dpi, convert_review_json_to_pandas_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
30
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
31
- from tools.helper_functions import get_file_name_without_type, output_folder, clean_unicode_text, get_or_create_env_var, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
32
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
33
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
34
- from tools.presidio_analyzer_custom import recognizer_result_from_dict
35
-
36
- # Number of pages to loop through before breaking. Currently set very high, as functions are breaking on time metrics (e.g. every 105 seconds), rather than on number of pages redacted.
37
- page_break_value = get_or_create_env_var('page_break_value', '50000')
38
- print(f'The value of page_break_value is {page_break_value}')
39
-
40
- max_time_value = get_or_create_env_var('max_time_value', '999999')
41
- print(f'The value of max_time_value is {max_time_value}')
42
 
 
 
 
 
43
 
44
  def bounding_boxes_overlap(box1, box2):
45
  """Check if two bounding boxes overlap."""
@@ -103,6 +94,7 @@ def choose_and_run_redactor(file_paths:List[str],
103
  review_file_state:pd.DataFrame=[],
104
  output_folder:str=output_folder,
105
  document_cropboxes:List=[],
 
106
  progress=gr.Progress(track_tqdm=True)):
107
  '''
108
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -143,6 +135,7 @@ def choose_and_run_redactor(file_paths:List[str],
143
  - review_file_state (pd.DataFrame, optional): Output review file dataframe.
144
  - output_folder (str, optional): Output folder for results.
145
  - document_cropboxes (List, optional): List of document cropboxes for the PDF.
 
146
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
147
 
148
  The function returns a redacted document along with processing logs.
@@ -239,7 +232,7 @@ def choose_and_run_redactor(file_paths:List[str],
239
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
240
  print("Estimated total processing time:", str(estimate_total_processing_time))
241
 
242
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
243
 
244
  # If we have reached the last page, return message and outputs
245
  if current_loop_page >= number_of_pages:
@@ -255,7 +248,7 @@ def choose_and_run_redactor(file_paths:List[str],
255
 
256
  review_out_file_paths.extend(out_review_file_path)
257
 
258
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes
259
 
260
  # Create allow list
261
  # If string, assume file path
@@ -484,7 +477,7 @@ def choose_and_run_redactor(file_paths:List[str],
484
  #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
485
  #print("page_sizes before in choose and run redactor:", page_sizes)
486
 
487
- review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table, page_sizes)
488
 
489
  #print("annotation_all_pages:", annotations_all_pages)
490
  #print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
@@ -560,7 +553,7 @@ def choose_and_run_redactor(file_paths:List[str],
560
  out_file_paths = list(set(out_file_paths))
561
  review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
562
 
563
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes
564
 
565
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
566
  '''
 
8
 
9
  from tqdm import tqdm
10
  from PIL import Image, ImageChops, ImageFile, ImageDraw
 
11
  from typing import List, Dict, Tuple
12
  import pandas as pd
13
 
 
14
  from pdfminer.high_level import extract_pages
15
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTTextLineHorizontal, LTAnno
16
  from pikepdf import Pdf, Dictionary, Name
17
+ from pymupdf import Rect, Page
 
 
18
  import gradio as gr
19
  from gradio import Progress
20
  from collections import defaultdict # For efficient grouping
21
 
22
+ from tools.config import output_folder, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, page_break_value, max_time_value, LOAD_TRUNCATED_IMAGES
 
23
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
24
+ from tools.file_conversion import process_file, convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
+ from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector
27
  from tools.file_conversion import process_file, is_pdf, is_pdf_or_image, prepare_image_or_pdf
28
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
 
 
 
 
 
 
 
 
29
 
30
+ ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
31
+ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
32
+ else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
33
+ image_dpi = float(IMAGES_DPI)
34
 
35
  def bounding_boxes_overlap(box1, box2):
36
  """Check if two bounding boxes overlap."""
 
94
  review_file_state:pd.DataFrame=[],
95
  output_folder:str=output_folder,
96
  document_cropboxes:List=[],
97
+ page_sizes:List[dict]=[],
98
  progress=gr.Progress(track_tqdm=True)):
99
  '''
100
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
 
135
  - review_file_state (pd.DataFrame, optional): Output review file dataframe.
136
  - output_folder (str, optional): Output folder for results.
137
  - document_cropboxes (List, optional): List of document cropboxes for the PDF.
138
+ - page_sizes (List[dict], optional): List of dictionaries of PDF page sizes in PDF or image format.
139
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
140
 
141
  The function returns a redacted document along with processing logs.
 
232
  estimate_total_processing_time = sum_numbers_before_seconds(combined_out_message)
233
  print("Estimated total processing time:", str(estimate_total_processing_time))
234
 
235
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
236
 
237
  # If we have reached the last page, return message and outputs
238
  if current_loop_page >= number_of_pages:
 
248
 
249
  review_out_file_paths.extend(out_review_file_path)
250
 
251
+ return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = False, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, page_sizes, document_cropboxes
252
 
253
  # Create allow list
254
  # If string, assume file path
 
477
  #print("all_decision_process_table before in choose and run redactor:", all_decision_process_table)
478
  #print("page_sizes before in choose and run redactor:", page_sizes)
479
 
480
+ review_df = convert_annotation_json_to_review_df(annotations_all_pages, all_decision_process_table, page_sizes)
481
 
482
  #print("annotation_all_pages:", annotations_all_pages)
483
  #print("all_decision_process_table after in choose and run redactor:", all_decision_process_table)
 
553
  out_file_paths = list(set(out_file_paths))
554
  review_out_file_paths = [prepared_pdf_file_paths[0], out_review_file_path]
555
 
556
+ return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_df, page_sizes, document_cropboxes
557
 
558
  def convert_pikepdf_coords_to_pymupdf(pymupdf_page, pikepdf_bbox, type="pikepdf_annot"):
559
  '''
tools/find_duplicate_pages.py CHANGED
@@ -1,6 +1,6 @@
1
  import pandas as pd
2
- import argparse
3
- import glob
4
  import os
5
  import re
6
  from tools.helper_functions import output_folder
 
1
  import pandas as pd
2
+ #import argparse
3
+ #import glob
4
  import os
5
  import re
6
  from tools.helper_functions import output_folder
tools/helper_functions.py CHANGED
@@ -9,19 +9,7 @@ import unicodedata
9
  from typing import List
10
  from gradio_image_annotation import image_annotator
11
  from tools.auth import user_pool_id
12
-
13
-
14
- def get_or_create_env_var(var_name, default_value):
15
- # Get the environment variable if it exists
16
- value = os.environ.get(var_name)
17
-
18
- # If it doesn't exist, set it to the default value
19
- if value is None:
20
- os.environ[var_name] = default_value
21
- value = default_value
22
-
23
- return value
24
-
25
 
26
  # Names for options labels
27
  text_ocr_option = "Local model - selectable text"
@@ -31,24 +19,6 @@ textract_option = "AWS Textract service - all PDF types"
31
  local_pii_detector = "Local"
32
  aws_pii_detector = "AWS Comprehend"
33
 
34
- output_folder = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/')
35
- print(f'The value of GRADIO_OUTPUT_FOLDER is {output_folder}')
36
-
37
- session_output_folder = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False')
38
- print(f'The value of SESSION_OUTPUT_FOLDER is {session_output_folder}')
39
-
40
- input_folder = get_or_create_env_var('GRADIO_INPUT_FOLDER', 'input/')
41
- print(f'The value of GRADIO_INPUT_FOLDER is {input_folder}')
42
-
43
- # Retrieving or setting CUSTOM_HEADER
44
- CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
45
- print(f'CUSTOM_HEADER found')
46
-
47
- # Retrieving or setting CUSTOM_HEADER_VALUE
48
- CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
49
- print(f'CUSTOM_HEADER_VALUE found')
50
-
51
-
52
  def reset_state_vars():
53
  return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
54
  label="Modify redaction boxes",
@@ -268,24 +238,8 @@ def merge_csv_files(file_list):
268
 
269
  return output_files
270
 
271
-
272
-
273
  async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
274
 
275
- #print("request user:", request.username)
276
-
277
- #request_data = await request.json() # Parse JSON body
278
- #print("All request data:", request_data)
279
- #context_value = request_data.get('context')
280
- #if 'context' in request_data:
281
- # print("Request context dictionary:", request_data['context'])
282
-
283
- # print("Request headers dictionary:", request.headers)
284
- # print("All host elements", request.client)
285
- # print("IP address:", request.client.host)
286
- # print("Query parameters:", dict(request.query_params))
287
- # To get the underlying FastAPI items you would need to use await and some fancy @ stuff for a live query: https://fastapi.tiangolo.com/vi/reference/request/
288
- #print("Request dictionary to object:", request.request.body())
289
  print("Session hash:", request.session_hash)
290
 
291
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
 
9
  from typing import List
10
  from gradio_image_annotation import image_annotator
11
  from tools.auth import user_pool_id
12
+ from tools.config import CUSTOM_HEADER_VALUE, CUSTOM_HEADER, output_folder, session_output_folder
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
  # Names for options labels
15
  text_ocr_option = "Local model - selectable text"
 
19
  local_pii_detector = "Local"
20
  aws_pii_detector = "AWS Comprehend"
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def reset_state_vars():
23
  return [], [], pd.DataFrame(), pd.DataFrame(), 0, "", image_annotator(
24
  label="Modify redaction boxes",
 
238
 
239
  return output_files
240
 
 
 
241
  async def get_connection_params(request: gr.Request, output_folder_textbox:str='/output/'):
242
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  print("Session hash:", request.session_hash)
244
 
245
  if CUSTOM_HEADER and CUSTOM_HEADER_VALUE:
tools/presidio_analyzer_custom.py CHANGED
@@ -1,8 +1,8 @@
1
  import gradio as gr
2
  from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
- from tqdm import tqdm
4
 
5
- from presidio_analyzer import DictAnalyzerResult, RecognizerResult #, AnalyzerEngine
6
  from presidio_analyzer.nlp_engine import NlpArtifacts
7
 
8
  def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
 
1
  import gradio as gr
2
  from typing import List, Iterable, Dict, Union, Any, Optional, Iterator, Tuple
3
+ #from tqdm import tqdm
4
 
5
+ from presidio_analyzer import DictAnalyzerResult, RecognizerResult
6
  from presidio_analyzer.nlp_engine import NlpArtifacts
7
 
8
  def recognizer_result_from_dict(data: Dict) -> RecognizerResult:
tools/redaction_review.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
@@ -7,18 +9,18 @@ import uuid
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
10
- from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df, convert_pandas_df_to_review_json, CUSTOM_BOX_COLOUR
11
- from tools.helper_functions import get_file_name_without_type, output_folder, detect_file_type
12
- from tools.file_redaction import redact_page_with_pymupdf
13
- import json
14
- import os
15
- import re
16
  import pymupdf
17
- from fitz import Document, Rect
18
  from PIL import ImageDraw, Image
19
  from collections import defaultdict
20
 
21
- Image.MAX_IMAGE_PIXELS = None
 
 
 
 
 
22
 
23
  def decrease_page(number:int):
24
  '''
@@ -110,9 +112,7 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
110
  recogniser_dataframe_out = recogniser_dataframe_modified
111
 
112
  try:
113
- review_dataframe = convert_review_json_to_pandas_df(image_annotator_object, review_df, page_sizes)
114
-
115
- print("in get_filtered_recogniser_dataframe_and_dropdowns, recogniser_dropdown_value:", recogniser_dropdown_value)
116
 
117
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
118
  recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
@@ -140,7 +140,6 @@ def get_filtered_recogniser_dataframe_and_dropdowns(image_annotator_object:Annot
140
 
141
  return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
142
 
143
-
144
  def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
145
  '''
146
  Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
@@ -168,7 +167,6 @@ def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, reco
168
 
169
  return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
170
 
171
-
172
  def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
173
  return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
174
 
@@ -191,15 +189,24 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame, selected_rows
191
  # Keep only the rows that do not have a match in selected_rows_df
192
  out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
193
 
194
- out_image_annotations_state = convert_pandas_df_to_review_json(out_review_df, image_file_paths, page_sizes)
195
- recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
196
 
 
197
  else:
198
  out_review_df = review_df
199
- recogniser_entity_dataframe_base = pd.DataFrame()
200
- out_image_annotations_state = {}
 
 
 
 
 
 
 
 
201
 
202
- return out_review_df, out_image_annotations_state, recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
203
 
204
  def update_annotator(image_annotator_object:AnnotatedImageData,
205
  page_num:int,
@@ -315,8 +322,6 @@ def modify_existing_page_redactions(image_annotator_object:AnnotatedImageData,
315
  if not current_page:
316
  current_page = 1
317
 
318
- print("in modify_existing_page_redactions - recogniser_entities_dropdown_value:", recogniser_entities_dropdown_value)
319
-
320
  image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
321
 
322
  if clear_all == False:
@@ -471,10 +476,10 @@ def apply_redactions(image_annotator_object:AnnotatedImageData,
471
  #print("page_sizes before conversion in apply redactions:", page_sizes)
472
 
473
  # Convert json to csv and also save this
474
- review_df = convert_review_json_to_pandas_df(all_image_annotations, review_file_state, page_sizes=page_sizes)
475
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
476
 
477
- print("Saving review file after convert_review_json function in apply redactions")
478
  review_df.to_csv(out_review_file_file_path, index=None)
479
  output_files.append(out_review_file_file_path)
480
 
@@ -589,6 +594,9 @@ def update_entities_df_text(choice:str, df:pd.DataFrame, label_dropdown_value:st
589
  return filtered_df, recogniser_entities_drop, page_entities_drop
590
 
591
  def reset_dropdowns():
 
 
 
592
  return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
593
 
594
  def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
@@ -612,10 +620,13 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
612
  - image_width: Width of the source image
613
  - image_height: Height of the source image
614
  - x1, y1, x2, y2: Coordinates in image space
 
615
 
616
  Returns:
617
  - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
618
  '''
 
 
619
 
620
  # Calculate scaling factors
621
  scale_width = pdf_page_width / image_width
@@ -636,12 +647,34 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
636
 
637
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
638
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
 
640
- def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[]):
641
  '''
642
  Create an xfdf file from a review csv file and a pdf
643
  '''
644
-
 
645
  # Create root element
646
  xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
647
 
@@ -651,13 +684,49 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
651
 
652
  # Add annots
653
  annots = SubElement(xfdf, 'annots')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
654
 
655
- for _, row in df.iterrows():
 
656
  page_python_format = int(row["page"])-1
657
 
658
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
659
 
660
- # Load cropbox sizes
661
  if document_cropboxes:
662
  #print("Document cropboxes:", document_cropboxes)
663
 
@@ -672,13 +741,12 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
672
  else:
673
  print("Document cropboxes not found.")
674
 
 
675
  pdf_page_height = pymupdf_page.mediabox.height
676
  pdf_page_width = pymupdf_page.mediabox.width
677
 
678
  image = image_paths[page_python_format]
679
 
680
- #print("image:", image)
681
-
682
  if isinstance(image, str):
683
  image = Image.open(image)
684
 
@@ -695,16 +763,22 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
695
  redact_annot.set('page', str(int(row['page']) - 1))
696
 
697
  # Convert coordinates
698
- x1, y1, x2, y2 = convert_image_coords_to_adobe(
699
- pdf_page_width,
700
- pdf_page_height,
701
- image_page_width,
702
- image_page_height,
703
- row['xmin'],
704
- row['ymin'],
705
- row['xmax'],
706
- row['ymax']
707
- )
 
 
 
 
 
 
708
 
709
  if CUSTOM_BOX_COLOUR == "grey":
710
  colour_str = "0.5,0.5,0.5"
@@ -756,12 +830,13 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:L
756
 
757
  return xml_str
758
 
759
- def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[]):
760
  '''
761
  Load in files to convert a review file into an Adobe comment file format
762
  '''
763
  output_paths = []
764
  pdf_name = ""
 
765
 
766
  if isinstance(input_files, str):
767
  file_paths_list = [input_files]
@@ -778,29 +853,29 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], ou
778
  else:
779
  file_path = file.name
780
 
781
- file_path_name = get_file_name_without_type(file_path)
782
- file_path_end = detect_file_type(file_path)
783
 
784
- if file_path_end == "pdf":
785
- pdf_name = os.path.basename(file_path)
786
 
787
- if file_path_end == "csv":
788
- # If no pdf name, just get the name of the file path
789
- if not pdf_name:
790
- pdf_name = file_path_name
791
- # Read CSV file
792
- df = pd.read_csv(file_path)
793
 
794
- df.fillna('', inplace=True) # Replace NaN with an empty string
795
 
796
- xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths, document_cropboxes)
797
 
798
- output_path = output_folder + file_path_name + "_adobe.xfdf"
799
-
800
- with open(output_path, 'w', encoding='utf-8') as f:
801
- f.write(xfdf_content)
802
 
803
- output_paths.append(output_path)
804
 
805
  return output_paths
806
 
@@ -841,7 +916,7 @@ def convert_adobe_coords_to_image(pdf_page_width:float, pdf_page_height:float, i
841
 
842
  return image_x1, image_y1, image_x2, image_y2
843
 
844
- def parse_xfdf(xfdf_path):
845
  '''
846
  Parse the XFDF file and extract redaction annotations.
847
 
 
1
+ import os
2
+ import re
3
  import gradio as gr
4
  import pandas as pd
5
  import numpy as np
 
9
  from typing import List
10
  from gradio_image_annotation import image_annotator
11
  from gradio_image_annotation.image_annotator import AnnotatedImageData
12
+ from pymupdf import Document, Rect
 
 
 
 
 
13
  import pymupdf
14
+ #from fitz
15
  from PIL import ImageDraw, Image
16
  from collections import defaultdict
17
 
18
+ from tools.config import output_folder, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS
19
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json
20
+ from tools.helper_functions import get_file_name_without_type, detect_file_type
21
+ from tools.file_redaction import redact_page_with_pymupdf
22
+
23
+ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
24
 
25
  def decrease_page(number:int):
26
  '''
 
112
  recogniser_dataframe_out = recogniser_dataframe_modified
113
 
114
  try:
115
+ review_dataframe = convert_annotation_json_to_review_df(image_annotator_object, review_df, page_sizes)
 
 
116
 
117
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
118
  recogniser_entities_drop = gr.Dropdown(value=recogniser_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
 
140
 
141
  return recogniser_dataframe_out, recogniser_dataframe_out, recogniser_entities_drop, recogniser_entities_list, text_entities_drop, page_entities_drop
142
 
 
143
  def update_recogniser_dataframes(image_annotator_object:AnnotatedImageData, recogniser_dataframe_modified:pd.DataFrame, recogniser_entities_dropdown_value:str="ALL", text_dropdown_value:str="ALL", page_dropdown_value:str="ALL", review_df:pd.DataFrame=[], page_sizes:list[str]=[]):
144
  '''
145
  Update recogniser dataframe information that appears alongside the pdf pages on the review screen.
 
167
 
168
  return recogniser_entities_list, recogniser_dataframe_out, recogniser_dataframe_modified, recogniser_entities_drop, text_entities_drop, page_entities_drop
169
 
 
170
  def undo_last_removal(backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base):
171
  return backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
172
 
 
189
  # Keep only the rows that do not have a match in selected_rows_df
190
  out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
191
 
192
+ out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
193
+ out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
194
 
195
+ # Either there is nothing left in the selection dataframe, or the review dataframe
196
  else:
197
  out_review_df = review_df
198
+ out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
199
+
200
+ out_image_annotations_state = []
201
+
202
+ for page_no, page in enumerate(image_file_paths):
203
+ annotation = {}
204
+ annotation["image"] = image_file_paths[page_no]
205
+ annotation["boxes"] = []
206
+
207
+ out_image_annotations_state.append(annotation)
208
 
209
+ return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
210
 
211
  def update_annotator(image_annotator_object:AnnotatedImageData,
212
  page_num:int,
 
322
  if not current_page:
323
  current_page = 1
324
 
 
 
325
  image_annotator_object['image'] = all_image_annotations[previous_page - 1]["image"]
326
 
327
  if clear_all == False:
 
476
  #print("page_sizes before conversion in apply redactions:", page_sizes)
477
 
478
  # Convert json to csv and also save this
479
+ review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state, page_sizes=page_sizes)[["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"]]
480
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
481
 
482
+ #print("Saving review file after convert_annotation_json_to_review_df function in apply redactions")
483
  review_df.to_csv(out_review_file_file_path, index=None)
484
  output_files.append(out_review_file_file_path)
485
 
 
594
  return filtered_df, recogniser_entities_drop, page_entities_drop
595
 
596
  def reset_dropdowns():
597
+ '''
598
+ Return Gradio dropdown objects with value 'ALL'.
599
+ '''
600
  return gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True), gr.Dropdown(value="ALL", allow_custom_value=True)
601
 
602
  def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
 
620
  - image_width: Width of the source image
621
  - image_height: Height of the source image
622
  - x1, y1, x2, y2: Coordinates in image space
623
+ - page_sizes: List of dicts containing sizes of page as pymupdf page or PIL image
624
 
625
  Returns:
626
  - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
627
  '''
628
+
629
+
630
 
631
  # Calculate scaling factors
632
  scale_width = pdf_page_width / image_width
 
647
 
648
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
649
 
650
+ def convert_pymupdf_coords_to_adobe(x1: float, y1: float, x2: float, y2: float):
651
+ """
652
+ Converts coordinates from PyMuPDF (fitz) space to Adobe PDF space.
653
+
654
+ Parameters:
655
+ - pdf_page_width: Width of the PDF page
656
+ - pdf_page_height: Height of the PDF page
657
+ - x1, y1, x2, y2: Coordinates in PyMuPDF space
658
+
659
+ Returns:
660
+ - Tuple of converted coordinates (x1, y1, x2, y2) in Adobe PDF space
661
+ """
662
+
663
+ # PyMuPDF and Adobe PDF coordinates are similar, but ensure y1 is always the lower value
664
+ pdf_x1, pdf_x2 = x1, x2
665
+
666
+ # Ensure y1 is the bottom coordinate and y2 is the top
667
+ pdf_y1, pdf_y2 = min(y1, y2), max(y1, y2)
668
+
669
+ return pdf_x1, pdf_y1, pdf_x2, pdf_y2
670
+
671
 
672
+ def create_xfdf(review_file_df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[], page_sizes:List[dict]=[]):
673
  '''
674
  Create an xfdf file from a review csv file and a pdf
675
  '''
676
+ pages_are_images = True
677
+
678
  # Create root element
679
  xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")
680
 
 
684
 
685
  # Add annots
686
  annots = SubElement(xfdf, 'annots')
687
+
688
+ # Check if page size object exists, and if current coordinates are in relative format or image coordinates format.
689
+ if page_sizes:
690
+ page_sizes_df = pd.DataFrame(page_sizes)
691
+
692
+ # If there are no image coordinates, then convert coordinates to pymupdf coordinates prior to export
693
+ if len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == len(page_sizes_df["image_width"]):
694
+ print("No image dimensions found, using pymupdf coordinates for conversion.")
695
+
696
+ if "mediabox_width" not in review_file_df.columns:
697
+ review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
698
+
699
+ # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
700
+ if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
701
+ review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["mediabox_width"]
702
+ review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["mediabox_width"]
703
+ review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["mediabox_height"]
704
+ review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["mediabox_height"]
705
+
706
+ pages_are_images = False
707
+
708
+ # If no nulls, then can do image coordinate conversion
709
+ elif len(page_sizes_df.loc[page_sizes_df["image_width"].isnull(),"image_width"]) == 0:
710
+
711
+ if "image_width" not in review_file_df.columns:
712
+ review_file_df = review_file_df.merge(page_sizes_df, how="left", on = "page")
713
+
714
+ # If all coordinates are less or equal to one, this is a relative page scaling - change back to image coordinates
715
+ if review_file_df["xmin"].max() <= 1 and review_file_df["xmax"].max() <= 1 and review_file_df["ymin"].max() <= 1 and review_file_df["ymax"].max() <= 1:
716
+ review_file_df["xmin"] = review_file_df["xmin"] * review_file_df["image_width"]
717
+ review_file_df["xmax"] = review_file_df["xmax"] * review_file_df["image_width"]
718
+ review_file_df["ymin"] = review_file_df["ymin"] * review_file_df["image_height"]
719
+ review_file_df["ymax"] = review_file_df["ymax"] * review_file_df["image_height"]
720
+
721
+ pages_are_images = True
722
 
723
+ # Go through each row of the review_file_df, create an entry in the output Adobe xfdf file.
724
+ for _, row in review_file_df.iterrows():
725
  page_python_format = int(row["page"])-1
726
 
727
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
728
 
729
+ # Load cropbox sizes. Set cropbox to the original cropbox sizes from when the document was loaded into the app.
730
  if document_cropboxes:
731
  #print("Document cropboxes:", document_cropboxes)
732
 
 
741
  else:
742
  print("Document cropboxes not found.")
743
 
744
+
745
  pdf_page_height = pymupdf_page.mediabox.height
746
  pdf_page_width = pymupdf_page.mediabox.width
747
 
748
  image = image_paths[page_python_format]
749
 
 
 
750
  if isinstance(image, str):
751
  image = Image.open(image)
752
 
 
763
  redact_annot.set('page', str(int(row['page']) - 1))
764
 
765
  # Convert coordinates
766
+ if pages_are_images == True:
767
+ x1, y1, x2, y2 = convert_image_coords_to_adobe(
768
+ pdf_page_width,
769
+ pdf_page_height,
770
+ image_page_width,
771
+ image_page_height,
772
+ row['xmin'],
773
+ row['ymin'],
774
+ row['xmax'],
775
+ row['ymax']
776
+ )
777
+ else:
778
+ x1, y1, x2, y2 = convert_pymupdf_coords_to_adobe(row['xmin'],
779
+ row['ymin'],
780
+ row['xmax'],
781
+ row['ymax'])
782
 
783
  if CUSTOM_BOX_COLOUR == "grey":
784
  colour_str = "0.5,0.5,0.5"
 
830
 
831
  return xml_str
832
 
833
+ def convert_df_to_xfdf(input_files:List[str], pdf_doc:Document, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[], page_sizes:List[dict]=[]):
834
  '''
835
  Load in files to convert a review file into an Adobe comment file format
836
  '''
837
  output_paths = []
838
  pdf_name = ""
839
+ file_path_name = ""
840
 
841
  if isinstance(input_files, str):
842
  file_paths_list = [input_files]
 
853
  else:
854
  file_path = file.name
855
 
856
+ file_path_name = get_file_name_without_type(file_path)
857
+ file_path_end = detect_file_type(file_path)
858
 
859
+ if file_path_end == "pdf":
860
+ pdf_name = os.path.basename(file_path)
861
 
862
+ if file_path_end == "csv":
863
+ # If no pdf name, just get the name of the file path
864
+ if not pdf_name:
865
+ pdf_name = file_path_name
866
+ # Read CSV file
867
+ review_file_df = pd.read_csv(file_path)
868
 
869
+ review_file_df.fillna('', inplace=True) # Replace NaN in review file with an empty string
870
 
871
+ xfdf_content = create_xfdf(review_file_df, pdf_name, pdf_doc, image_paths, document_cropboxes, page_sizes)
872
 
873
+ output_path = output_folder + file_path_name + "_adobe.xfdf"
874
+
875
+ with open(output_path, 'w', encoding='utf-8') as f:
876
+ f.write(xfdf_content)
877
 
878
+ output_paths.append(output_path)
879
 
880
  return output_paths
881
 
 
916
 
917
  return image_x1, image_y1, image_x2, image_y2
918
 
919
+ def parse_xfdf(xfdf_path:str):
920
  '''
921
  Parse the XFDF file and extract redaction annotations.
922