seanpedrickcase committed on
Commit
08a3ec3
·
1 Parent(s): c6b043a

Redaction is now performed on the whole PDF mediabox size (which is sometimes larger than the viewable size), and the results are then converted back to cropbox size for print and Adobe review. Improved some error raising and app flow.

Browse files
app.py CHANGED
@@ -83,6 +83,7 @@ with app:
83
  do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
84
 
85
  prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
 
86
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
87
 
88
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
@@ -121,7 +122,7 @@ with app:
121
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
122
 
123
  ## Annotator zoom value
124
- annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=80, precision=0, visible=False)
125
  zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
126
  zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
127
 
@@ -212,22 +213,21 @@ with app:
212
  with gr.Accordion(label = "Review redaction file", open=True):
213
  output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
214
  upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
215
-
 
 
 
 
 
 
216
  with gr.Row():
217
  annotation_last_page_button = gr.Button("Previous page", scale = 3)
218
  annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
219
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
220
  annotation_next_page_button = gr.Button("Next page", scale = 3)
221
- with gr.Row():
222
- annotate_zoom_in = gr.Button("Zoom in")
223
- annotate_zoom_out = gr.Button("Zoom out")
224
- with gr.Row():
225
- annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
226
- with gr.Row():
227
- clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
228
 
229
  with gr.Row():
230
- with gr.Column(scale=1):
231
 
232
  zoom_str = str(annotator_zoom_number) + '%'
233
 
@@ -248,6 +248,10 @@ with app:
248
  handles_cursor=True,
249
  interactive=False
250
  )
 
 
 
 
251
 
252
  with gr.Row():
253
  annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
@@ -255,15 +259,12 @@ with app:
255
  annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
256
  annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
257
 
258
- #with gr.Column(scale=1):
259
- with gr.Row():
260
- recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
261
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
262
 
263
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
264
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
265
  adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
266
- convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="primary")
267
 
268
  ###
269
  # TEXT / TABULAR DATA TAB
@@ -369,19 +370,19 @@ with app:
369
  ###
370
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
371
 
372
- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
373
- then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
374
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
375
- then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
376
 
377
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
378
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
379
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
380
- then(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
381
 
382
  # If a file has been completed, the function will continue onto the next document
383
  latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
384
- then(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
385
 
386
  ###
387
  # REVIEW PDF REDACTIONS
@@ -389,85 +390,85 @@ with app:
389
 
390
  # Upload previous files for modifying redactions
391
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
392
- then(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
393
- then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
394
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
395
 
396
  # Page controls at top
397
  annotate_current_page.submit(
398
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
399
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
400
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
401
 
402
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
403
- then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
404
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
405
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
406
 
407
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
408
- then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
409
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
410
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
411
 
412
  # Zoom in and out on annotator
413
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
414
- then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
415
 
416
  annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
417
- then(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
418
 
419
  annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
420
 
421
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
422
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
423
 
424
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
425
 
426
  # Page controls at bottom
427
  annotate_current_page_bottom.submit(
428
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
429
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
430
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
431
 
432
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
433
- then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
434
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
435
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
436
 
437
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
438
- then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
439
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
440
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
441
 
442
  # Review table controls
443
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
444
 
445
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
446
- then(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
447
- then(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
448
- then(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
449
 
450
  # Convert review file to xfdf Adobe format
451
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
452
- then(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
453
- then(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[adobe_review_files_out])
454
 
455
  # Convert xfdf Adobe file back to review_file.csv
456
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
457
- then(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state]).\
458
- then(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
459
 
460
  ###
461
  # TABULAR DATA REDACTION
462
  ###
463
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
464
- then(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
465
 
466
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
467
 
468
  # If the output file count text box changes, keep going with redacting each data file until done
469
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
470
- then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
471
 
472
  ###
473
  # IDENTIFY DUPLICATE PAGES
@@ -500,7 +501,7 @@ with app:
500
  # print("default_allow_list_output_folder_location:", default_allow_list_loc)
501
  # if not os.path.exists(default_allow_list_loc):
502
  # app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
503
- # then(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
504
  # else:
505
  # app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
506
 
@@ -508,25 +509,25 @@ with app:
508
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
509
  access_callback.setup([session_hash_textbox], access_logs_folder)
510
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
511
- then(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
512
 
513
  # User submitted feedback for pdf redactions
514
  pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
515
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
516
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
517
- then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
518
 
519
  # User submitted feedback for data redactions
520
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
521
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
522
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
523
- then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
524
 
525
  # Log processing time/token usage when making a query
526
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
527
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
528
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
529
- then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
530
 
531
  # Get some environment variables and Launch the Gradio app
532
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 
83
  do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
84
 
85
  prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
86
+ document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
87
  images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
88
 
89
  output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
 
122
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
123
 
124
  ## Annotator zoom value
125
+ annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
126
  zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
127
  zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
128
 
 
213
  with gr.Accordion(label = "Review redaction file", open=True):
214
  output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
215
  upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
216
+ with gr.Row():
217
+ annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
218
+ with gr.Row():
219
+ annotate_zoom_in = gr.Button("Zoom in", visible=False)
220
+ annotate_zoom_out = gr.Button("Zoom out", visible=False)
221
+ with gr.Row():
222
+ clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
223
  with gr.Row():
224
  annotation_last_page_button = gr.Button("Previous page", scale = 3)
225
  annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
226
  annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
227
  annotation_next_page_button = gr.Button("Next page", scale = 3)
 
 
 
 
 
 
 
228
 
229
  with gr.Row():
230
+ with gr.Column(scale=3):
231
 
232
  zoom_str = str(annotator_zoom_number) + '%'
233
 
 
248
  handles_cursor=True,
249
  interactive=False
250
  )
251
+ with gr.Column(scale=1):
252
+ #with gr.Row():
253
+ recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
254
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=(2,"fixed"), type="pandas", label="Search results. Click to go to page")
255
 
256
  with gr.Row():
257
  annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
 
259
  annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
260
  annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
261
 
262
+
 
 
 
263
 
264
  with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
265
  convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
266
  adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
267
+ convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="secondary")
268
 
269
  ###
270
  # TEXT / TABULAR DATA TAB
 
370
  ###
371
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
372
 
373
+ document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
374
+ success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
375
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
376
+ success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
377
 
378
  # If the app has completed a batch of pages, it will run this until the end of all pages in the document
379
  current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
380
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
381
+ success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
382
 
383
  # If a file has been completed, the function will continue onto the next document
384
  latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
385
+ success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
386
 
387
  ###
388
  # REVIEW PDF REDACTIONS
 
390
 
391
  # Upload previous files for modifying redactions
392
  upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
393
+ success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
394
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes], api_name="prepare_doc").\
395
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
396
 
397
  # Page controls at top
398
  annotate_current_page.submit(
399
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
400
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
401
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
402
 
403
  annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
404
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
405
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
406
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
407
 
408
  annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
409
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
410
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
411
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
412
 
413
  # Zoom in and out on annotator
414
  annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
415
+ success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
416
 
417
  annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
418
+ success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
419
 
420
  annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
421
 
422
  clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
423
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
424
 
425
  annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
426
 
427
  # Page controls at bottom
428
  annotate_current_page_bottom.submit(
429
  modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
430
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
431
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
432
 
433
  annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
434
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
435
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
436
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
437
 
438
  annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
439
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
440
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
441
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
442
 
443
  # Review table controls
444
  recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
445
 
446
  recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
447
+ success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
448
+ success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
449
+ success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
450
 
451
  # Convert review file to xfdf Adobe format
452
  convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
453
+ success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
454
+ success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
455
 
456
  # Convert xfdf Adobe file back to review_file.csv
457
  convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
458
+ success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
459
+ success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
460
 
461
  ###
462
  # TABULAR DATA REDACTION
463
  ###
464
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
465
+ success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
466
 
467
  tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
468
 
469
  # If the output file count text box changes, keep going with redacting each data file until done
470
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
471
+ success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
472
 
473
  ###
474
  # IDENTIFY DUPLICATE PAGES
 
501
  # print("default_allow_list_output_folder_location:", default_allow_list_loc)
502
  # if not os.path.exists(default_allow_list_loc):
503
  # app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
504
+ # success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
505
  # else:
506
  # app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
507
 
 
509
  access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
510
  access_callback.setup([session_hash_textbox], access_logs_folder)
511
  session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
512
+ success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
513
 
514
  # User submitted feedback for pdf redactions
515
  pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
516
  pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
517
  pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
518
+ success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
519
 
520
  # User submitted feedback for data redactions
521
  data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
522
  data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
523
  data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
524
+ success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
525
 
526
  # Log processing time/token usage when making a query
527
  usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
528
  usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
529
  latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
530
+ success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
531
 
532
  # Get some environment variables and Launch the Gradio app
533
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
tools/file_conversion.py CHANGED
@@ -70,7 +70,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
70
  else:
71
  # Convert PDF page to image
72
  image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
73
- dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
74
  image = image_l[0]
75
  image = image.convert("L")
76
  image.save(out_path, format="PNG")
@@ -139,59 +139,6 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
139
  return images
140
 
141
 
142
- # def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
143
-
144
- # print("pdf_path in convert_pdf_to_images:", pdf_path)
145
-
146
- # # Get the number of pages in the PDF
147
- # page_count = pdfinfo_from_path(pdf_path)['Pages']
148
- # print("Number of pages in PDF: ", str(page_count))
149
-
150
- # images = []
151
-
152
- # # Open the PDF file
153
- # #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
154
- # for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
155
-
156
- # #print("page_num in convert_pdf_to_images:", page_num)
157
-
158
- # print("Converting page: ", str(page_num + 1))
159
-
160
- # # Convert one page to image
161
- # out_path = pdf_path + "_" + str(page_num) + ".png"
162
-
163
- # # Ensure the directory exists
164
- # os.makedirs(os.path.dirname(out_path), exist_ok=True)
165
-
166
- # # Check if the image already exists
167
- # if os.path.exists(out_path):
168
- # #print(f"Loading existing image from {out_path}.")
169
- # image = Image.open(out_path) # Load the existing image
170
-
171
- # else:
172
- # image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
173
-
174
- # image = image_l[0]
175
-
176
- # # Convert to greyscale
177
- # image = image.convert("L")
178
-
179
- # image.save(out_path, format="PNG") # Save the new image
180
-
181
- # # If no images are returned, break the loop
182
- # if not image:
183
- # print("Conversion of page", str(page_num), "to file failed.")
184
- # break
185
-
186
- # # print("Conversion of page", str(page_num), "to file succeeded.")
187
- # # print("image:", image)
188
-
189
- # images.append(out_path)
190
-
191
- # print("PDF has been converted to images.")
192
- # # print("Images:", images)
193
-
194
- # return images
195
 
196
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
197
  def process_file(file_path:str, prepare_for_review:bool=False):
@@ -304,71 +251,6 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
304
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
305
  shape.commit()
306
 
307
- # def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
308
- # '''
309
- # Converts coordinates from pymupdf format to image coordinates,
310
- # accounting for mediabox dimensions and offset.
311
- # '''
312
- # # Get rect dimensions
313
- # rect = pymupdf_page.rect
314
- # rect_width = rect.width
315
- # rect_height = rect.height
316
-
317
- # # Get mediabox dimensions and position
318
- # mediabox = pymupdf_page.mediabox
319
- # mediabox_width = mediabox.width
320
- # mediabox_height = mediabox.height
321
-
322
- # # Get target image dimensions
323
- # image_page_width, image_page_height = image.size
324
-
325
- # # Calculate scaling factors
326
- # image_to_mediabox_x_scale = image_page_width / mediabox_width
327
- # image_to_mediabox_y_scale = image_page_height / mediabox_height
328
-
329
- # image_to_rect_scale_width = image_page_width / rect_width
330
- # image_to_rect_scale_height = image_page_height / rect_height
331
-
332
- # # Adjust for offsets (difference in position between mediabox and rect)
333
- # x_offset = rect.x0 - mediabox.x0 # Difference in x position
334
- # y_offset = rect.y0 - mediabox.y0 # Difference in y position
335
-
336
- # print("x_offset:", x_offset)
337
- # print("y_offset:", y_offset)
338
-
339
- # # Adjust coordinates:
340
- # # Apply scaling to match image dimensions
341
- # x1_image = x1 * image_to_mediabox_x_scale
342
- # x2_image = x2 * image_to_mediabox_x_scale
343
- # y1_image = y1 * image_to_mediabox_y_scale
344
- # y2_image = y2 * image_to_mediabox_y_scale
345
-
346
- # # Correct for difference in rect and mediabox size
347
- # if mediabox_width != rect_width:
348
-
349
- # mediabox_to_rect_x_scale = mediabox_width / rect_width
350
- # mediabox_to_rect_y_scale = mediabox_height / rect_height
351
-
352
- # x1_image *= mediabox_to_rect_x_scale
353
- # x2_image *= mediabox_to_rect_x_scale
354
- # y1_image *= mediabox_to_rect_y_scale
355
- # y2_image *= mediabox_to_rect_y_scale
356
-
357
- # print("mediabox_to_rect_x_scale:", mediabox_to_rect_x_scale)
358
- # #print("mediabox_to_rect_y_scale:", mediabox_to_rect_y_scale)
359
-
360
- # print("image_to_mediabox_x_scale:", image_to_mediabox_x_scale)
361
- # #print("image_to_mediabox_y_scale:", image_to_mediabox_y_scale)
362
-
363
- # mediabox_rect_x_diff = (mediabox_width - rect_width) * 2
364
- # mediabox_rect_y_diff = (mediabox_height - rect_height) * 2
365
-
366
- # x1_image -= mediabox_rect_x_diff
367
- # x2_image -= mediabox_rect_x_diff
368
- # y1_image += mediabox_rect_y_diff
369
- # y2_image += mediabox_rect_y_diff
370
-
371
- # return x1_image, y1_image, x2_image, y2_image
372
 
373
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
374
  '''
@@ -434,8 +316,6 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
434
 
435
  return x1_image, y1_image, x2_image, y2_image
436
 
437
-
438
-
439
  def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
440
  # Small border to page that remains white
441
  border = 5
@@ -498,6 +378,7 @@ def prepare_image_or_pdf(
498
 
499
  tic = time.perf_counter()
500
  json_from_csv = False
 
501
 
502
  if isinstance(in_fully_redacted_list, pd.DataFrame):
503
  in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
@@ -586,14 +467,19 @@ def prepare_image_or_pdf(
586
  if not file_path:
587
  out_message = "Please select a file."
588
  print(out_message)
589
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
590
-
591
  file_extension = os.path.splitext(file_path)[1].lower()
592
 
593
  # If a pdf, load as a pymupdf document
594
  if is_pdf(file_path):
595
  pymupdf_doc = pymupdf.open(file_path)
596
 
 
 
 
 
 
597
  converted_file_path = file_path
598
  image_file_paths = process_file(file_path, prepare_for_review)
599
 
@@ -737,13 +623,13 @@ def prepare_image_or_pdf(
737
  if is_pdf_or_image(file_path) == False:
738
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
739
  print(out_message)
740
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
741
 
742
  elif in_redact_method == text_ocr_option:
743
  if is_pdf(file_path) == False:
744
  out_message = "Please upload a PDF file for text analysis."
745
  print(out_message)
746
- return out_message, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
747
 
748
 
749
  converted_file_paths.append(converted_file_path)
@@ -759,7 +645,7 @@ def prepare_image_or_pdf(
759
 
760
  number_of_pages = len(image_file_paths)
761
 
762
- return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
763
 
764
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
765
  file_path_without_ext = get_file_name_without_type(in_file_path)
 
70
  else:
71
  # Convert PDF page to image
72
  image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
73
+ dpi=image_dpi, use_cropbox=False, use_pdftocairo=False)
74
  image = image_l[0]
75
  image = image.convert("L")
76
  image.save(out_path, format="PNG")
 
139
  return images
140
 
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
 
143
  # Function to take in a file path, decide if it is an image or pdf, then process appropriately.
144
  def process_file(file_path:str, prepare_for_review:bool=False):
 
251
  #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
252
  shape.commit()
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
 
255
  def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
256
  '''
 
316
 
317
  return x1_image, y1_image, x2_image, y2_image
318
 
 
 
319
  def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
320
  # Small border to page that remains white
321
  border = 5
 
378
 
379
  tic = time.perf_counter()
380
  json_from_csv = False
381
+ original_cropboxes = [] # Store original CropBox values
382
 
383
  if isinstance(in_fully_redacted_list, pd.DataFrame):
384
  in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
 
467
  if not file_path:
468
  out_message = "Please select a file."
469
  print(out_message)
470
+ raise Exception(out_message)
471
+
472
  file_extension = os.path.splitext(file_path)[1].lower()
473
 
474
  # If a pdf, load as a pymupdf document
475
  if is_pdf(file_path):
476
  pymupdf_doc = pymupdf.open(file_path)
477
 
478
+ # Load cropbox dimensions to use later
479
+
480
+ for page in pymupdf_doc:
481
+ original_cropboxes.append(page.cropbox) # Save original CropBox
482
+
483
  converted_file_path = file_path
484
  image_file_paths = process_file(file_path, prepare_for_review)
485
 
 
623
  if is_pdf_or_image(file_path) == False:
624
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
625
  print(out_message)
626
+ raise Exception(out_message)
627
 
628
  elif in_redact_method == text_ocr_option:
629
  if is_pdf(file_path) == False:
630
  out_message = "Please upload a PDF file for text analysis."
631
  print(out_message)
632
+ raise Exception(out_message)
633
 
634
 
635
  converted_file_paths.append(converted_file_path)
 
645
 
646
  number_of_pages = len(image_file_paths)
647
 
648
+ return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes
649
 
650
  def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
651
  file_path_without_ext = get_file_name_without_type(in_file_path)
tools/file_redaction.py CHANGED
@@ -102,6 +102,7 @@ def choose_and_run_redactor(file_paths:List[str],
102
  annotate_max_pages:int=1,
103
  review_file_state=[],
104
  output_folder:str=output_folder,
 
105
  progress=gr.Progress(track_tqdm=True)):
106
  '''
107
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -140,6 +141,7 @@ def choose_and_run_redactor(file_paths:List[str],
140
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
141
  - annotate_max_pages (int, optional): Maximum page value for the annotation object
142
  - output_folder (str, optional): Output folder for results.
 
143
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
144
 
145
  The function returns a redacted document along with processing logs.
@@ -150,10 +152,8 @@ def choose_and_run_redactor(file_paths:List[str],
150
 
151
  # If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
152
  if not pymupdf_doc:
153
- print("Prepared PDF file not found, running prepare_image_or_pdf function")
154
- out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages)
155
-
156
- annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
157
 
158
  #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
159
  review_out_file_paths = [prepared_pdf_file_paths[0]]
@@ -183,7 +183,6 @@ def choose_and_run_redactor(file_paths:List[str],
183
  out_file_paths = []
184
  estimate_total_processing_time = 0
185
  estimated_time_taken_state = 0
186
-
187
  # If not the first time around, and the current page loop has been set to a huge number (been through all pages), reset current page to 0
188
  elif (first_loop_state == False) & (current_loop_page == 999):
189
  current_loop_page = 0
@@ -200,12 +199,10 @@ def choose_and_run_redactor(file_paths:List[str],
200
  else:
201
  number_of_files = len(file_paths)
202
 
203
- # If we have already redacted the last file, return the input out_message and file list to the relevant components
204
  if latest_file_completed >= number_of_files:
205
 
206
  print("Completed last file")
207
- # Set to a very high number so as not to mix up with subsequent file processing by the user
208
- # latest_file_completed = 99
209
  current_loop_page = 0
210
 
211
  if isinstance(out_message, list):
@@ -224,7 +221,7 @@ def choose_and_run_redactor(file_paths:List[str],
224
 
225
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
226
 
227
- # If we have reached the last page, return message
228
  if current_loop_page >= number_of_pages:
229
  print("Reached last page of document:", current_loop_page)
230
 
@@ -273,7 +270,8 @@ def choose_and_run_redactor(file_paths:List[str],
273
  comprehend_client = ""
274
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
275
  print(out_message)
276
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 
277
  else:
278
  comprehend_client = ""
279
 
@@ -296,7 +294,8 @@ def choose_and_run_redactor(file_paths:List[str],
296
  textract_client = ""
297
  out_message = "Cannot connect to AWS Textract. Please provide access keys under Textract settings on the Redaction settings tab,choose another text extraction method."
298
  print(out_message)
299
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
 
300
  else:
301
  textract_client = ""
302
 
@@ -336,15 +335,14 @@ def choose_and_run_redactor(file_paths:List[str],
336
  else:
337
  out_message = "No file selected"
338
  print(out_message)
339
-
340
- return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
341
 
342
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
343
 
344
  #Analyse and redact image-based pdf or image
345
  if is_pdf_or_image(file_path) == False:
346
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
347
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
348
 
349
  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
350
 
@@ -383,12 +381,10 @@ def choose_and_run_redactor(file_paths:List[str],
383
  all_request_metadata.append(new_request_metadata)
384
 
385
  elif in_redact_method == text_ocr_option:
386
-
387
- #log_files_output_paths = []
388
 
389
  if is_pdf(file_path) == False:
390
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
391
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
392
 
393
  # Analyse text-based pdf
394
  print('Redacting file as text-based PDF')
@@ -418,7 +414,7 @@ def choose_and_run_redactor(file_paths:List[str],
418
  else:
419
  out_message = "No redaction method selected"
420
  print(out_message)
421
- return out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
422
 
423
  # If at last page, save to file
424
  if current_loop_page >= number_of_pages:
@@ -434,8 +430,7 @@ def choose_and_run_redactor(file_paths:List[str],
434
  if is_pdf(file_path) == False:
435
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
436
  pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
437
- out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
438
-
439
  else:
440
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
441
  pymupdf_doc.save(out_redacted_pdf_file_path)
@@ -678,12 +673,13 @@ def move_page_info(file_path: str) -> str:
678
 
679
  return new_file_path
680
 
681
- def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_coords:bool=True):
682
 
683
  mediabox_height = page.mediabox[3] - page.mediabox[1]
684
  mediabox_width = page.mediabox[2] - page.mediabox[0]
685
  rect_height = page.rect.height
686
- rect_width = page.rect.width
 
687
 
688
  pymupdf_x1 = None
689
  pymupdf_x2 = None
@@ -801,6 +797,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
801
  }
802
 
803
  page.apply_redactions(images=0, graphics=0)
 
804
  page.clean_contents()
805
 
806
  return page, out_annotation_boxes
@@ -1003,9 +1000,10 @@ def redact_image_pdf(file_path:str,
1003
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
1004
 
1005
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1006
- print("Connection to AWS Comprehend service unsuccessful.")
 
 
1007
 
1008
- return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
1009
 
1010
  if analysis_type == textract_option and textract_client == "":
1011
  print("Connection to AWS Textract service unsuccessful.")
@@ -1057,6 +1055,8 @@ def redact_image_pdf(file_path:str,
1057
 
1058
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1059
 
 
 
1060
  for page_no in progress_bar:
1061
 
1062
  handwriting_or_signature_boxes = []
@@ -1076,6 +1076,9 @@ def redact_image_pdf(file_path:str,
1076
 
1077
  image_annotations = {"image": image, "boxes": []}
1078
  pymupdf_page = pymupdf_doc.load_page(page_no)
 
 
 
1079
 
1080
  if page_no >= page_min and page_no < page_max:
1081
 
@@ -1219,7 +1222,7 @@ def redact_image_pdf(file_path:str,
1219
  else: redact_whole_page = False
1220
  else: redact_whole_page = False
1221
 
1222
- pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image, redact_whole_page=redact_whole_page)
1223
 
1224
  # Convert decision process to table
1225
  decision_process_table = pd.DataFrame([{
@@ -1596,6 +1599,8 @@ def redact_text_pdf(
1596
  if current_loop_page == 0: page_loop_start = 0
1597
  else: page_loop_start = current_loop_page
1598
 
 
 
1599
  progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
1600
 
1601
  #for page_no in range(0, number_of_pages):
@@ -1615,6 +1620,9 @@ def redact_text_pdf(
1615
  image_annotations = {"image": image, "boxes": []}
1616
  pymupdf_page = pymupdf_doc.load_page(page_no)
1617
 
 
 
 
1618
  if page_min <= page_no < page_max:
1619
 
1620
  if isinstance(image, str):
@@ -1701,15 +1709,15 @@ def redact_text_pdf(
1701
  else: redact_whole_page = False
1702
  else: redact_whole_page = False
1703
 
1704
- pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
1705
 
1706
  reported_page_no = page_no + 1
1707
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1708
 
1709
  # Join extracted text outputs for all lines together
1710
  if not page_text_ocr_outputs.empty:
1711
- page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1712
- all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_ocr_outputs])
1713
 
1714
  # Write logs
1715
  # Create decision process table
 
102
  annotate_max_pages:int=1,
103
  review_file_state=[],
104
  output_folder:str=output_folder,
105
+ document_cropboxes:List=[],
106
  progress=gr.Progress(track_tqdm=True)):
107
  '''
108
  This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
 
141
  - aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
142
  - annotate_max_pages (int, optional): Maximum page value for the annotation object
143
  - output_folder (str, optional): Output folder for results.
144
+ - document_cropboxes (List, optional): List of document cropboxes for the PDF.
145
  - progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
146
 
147
  The function returns a redacted document along with processing logs.
 
152
 
153
  # If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
154
  if not pymupdf_doc:
155
+ print("Prepared PDF file not found, loading from file")
156
+ out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages, document_cropboxes)
 
 
157
 
158
  #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
159
  review_out_file_paths = [prepared_pdf_file_paths[0]]
 
183
  out_file_paths = []
184
  estimate_total_processing_time = 0
185
  estimated_time_taken_state = 0
 
186
  # If not the first time around, and the current page loop has been set to a huge number (been through all pages), reset current page to 0
187
  elif (first_loop_state == False) & (current_loop_page == 999):
188
  current_loop_page = 0
 
199
  else:
200
  number_of_files = len(file_paths)
201
 
202
+ # If we have already redacted the last file, return the input out_message and file list to the relevant outputs
203
  if latest_file_completed >= number_of_files:
204
 
205
  print("Completed last file")
 
 
206
  current_loop_page = 0
207
 
208
  if isinstance(out_message, list):
 
221
 
222
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
223
 
224
+ # If we have reached the last page, return message and outputs
225
  if current_loop_page >= number_of_pages:
226
  print("Reached last page of document:", current_loop_page)
227
 
 
270
  comprehend_client = ""
271
  out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
272
  print(out_message)
273
+ raise Exception(out_message)
274
+
275
  else:
276
  comprehend_client = ""
277
 
 
294
  textract_client = ""
295
  out_message = "Cannot connect to AWS Textract. Please provide access keys under Textract settings on the Redaction settings tab,choose another text extraction method."
296
  print(out_message)
297
+ raise Exception(out_message)
298
+
299
  else:
300
  textract_client = ""
301
 
 
335
  else:
336
  out_message = "No file selected"
337
  print(out_message)
338
+ raise Exception(out_message)
 
339
 
340
  if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
341
 
342
  #Analyse and redact image-based pdf or image
343
  if is_pdf_or_image(file_path) == False:
344
  out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
345
+ raise Exception(out_message)
346
 
347
  print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
348
 
 
381
  all_request_metadata.append(new_request_metadata)
382
 
383
  elif in_redact_method == text_ocr_option:
 
 
384
 
385
  if is_pdf(file_path) == False:
386
  out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
387
+ raise Exception(out_message)
388
 
389
  # Analyse text-based pdf
390
  print('Redacting file as text-based PDF')
 
414
  else:
415
  out_message = "No redaction method selected"
416
  print(out_message)
417
+ raise Exception(out_message)
418
 
419
  # If at last page, save to file
420
  if current_loop_page >= number_of_pages:
 
430
  if is_pdf(file_path) == False:
431
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
432
  pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
433
+ out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
 
434
  else:
435
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
436
  pymupdf_doc.save(out_redacted_pdf_file_path)
 
673
 
674
  return new_file_path
675
 
676
+ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_coords:bool=True, original_cropbox=[]):
677
 
678
  mediabox_height = page.mediabox[3] - page.mediabox[1]
679
  mediabox_width = page.mediabox[2] - page.mediabox[0]
680
  rect_height = page.rect.height
681
+ rect_width = page.rect.width
682
+
683
 
684
  pymupdf_x1 = None
685
  pymupdf_x2 = None
 
797
  }
798
 
799
  page.apply_redactions(images=0, graphics=0)
800
+ page.set_cropbox(original_cropbox) # Set CropBox to original size
801
  page.clean_contents()
802
 
803
  return page, out_annotation_boxes
 
1000
  image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
1001
 
1002
  if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
1003
+ out_message = "Connection to AWS Comprehend service unsuccessful."
1004
+ print(out_message)
1005
+ raise Exception(out_message)
1006
 
 
1007
 
1008
  if analysis_type == textract_option and textract_client == "":
1009
  print("Connection to AWS Textract service unsuccessful.")
 
1055
 
1056
  progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
1057
 
1058
+ original_cropboxes = []
1059
+
1060
  for page_no in progress_bar:
1061
 
1062
  handwriting_or_signature_boxes = []
 
1076
 
1077
  image_annotations = {"image": image, "boxes": []}
1078
  pymupdf_page = pymupdf_doc.load_page(page_no)
1079
+
1080
+ original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
1081
+ pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
1082
 
1083
  if page_no >= page_min and page_no < page_max:
1084
 
 
1222
  else: redact_whole_page = False
1223
  else: redact_whole_page = False
1224
 
1225
+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image, redact_whole_page=redact_whole_page, original_cropbox=original_cropboxes[-1])
1226
 
1227
  # Convert decision process to table
1228
  decision_process_table = pd.DataFrame([{
 
1599
  if current_loop_page == 0: page_loop_start = 0
1600
  else: page_loop_start = current_loop_page
1601
 
1602
+ original_cropboxes = []
1603
+
1604
  progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
1605
 
1606
  #for page_no in range(0, number_of_pages):
 
1620
  image_annotations = {"image": image, "boxes": []}
1621
  pymupdf_page = pymupdf_doc.load_page(page_no)
1622
 
1623
+ original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
1624
+ pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
1625
+
1626
  if page_min <= page_no < page_max:
1627
 
1628
  if isinstance(image, str):
 
1709
  else: redact_whole_page = False
1710
  else: redact_whole_page = False
1711
 
1712
+ pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False, original_cropbox=original_cropboxes[-1])
1713
 
1714
  reported_page_no = page_no + 1
1715
  print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
1716
 
1717
  # Join extracted text outputs for all lines together
1718
  if not page_text_ocr_outputs.empty:
1719
+ page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
1720
+ all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_ocr_outputs])
1721
 
1722
  # Write logs
1723
  # Create decision process table
tools/helper_functions.py CHANGED
@@ -60,7 +60,7 @@ def reset_state_vars():
60
  show_share_button=False,
61
  show_remove_button=False,
62
  interactive=False
63
- ), [], [], [], pd.DataFrame(), pd.DataFrame()
64
 
65
  def reset_review_vars():
66
  return [], pd.DataFrame(), pd.DataFrame()
 
60
  show_share_button=False,
61
  show_remove_button=False,
62
  interactive=False
63
+ ), [], [], [], pd.DataFrame(), pd.DataFrame(), []
64
 
65
  def reset_review_vars():
66
  return [], pd.DataFrame(), pd.DataFrame()
tools/redaction_review.py CHANGED
@@ -12,8 +12,9 @@ from tools.helper_functions import get_file_name_without_type, output_folder, de
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
 
15
  import pymupdf
16
- from fitz import Document
17
  from PIL import ImageDraw, Image
18
  from collections import defaultdict
19
 
@@ -431,7 +432,7 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
431
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
432
 
433
 
434
- def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
435
  '''
436
  Create an xfdf file from a review csv file and a pdf
437
  '''
@@ -451,8 +452,23 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
451
 
452
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
453
 
454
- pdf_page_height = pymupdf_page.rect.height
455
- pdf_page_width = pymupdf_page.rect.width
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
456
 
457
  image = image_paths[page_python_format]
458
 
@@ -535,7 +551,7 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
535
 
536
  return xml_str
537
 
538
- def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder):
539
  '''
540
  Load in files to convert a review file into an Adobe comment file format
541
  '''
@@ -572,7 +588,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], ou
572
 
573
  df.fillna('', inplace=True) # Replace NaN with an empty string
574
 
575
- xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)
576
 
577
  output_path = output_folder + file_path_name + "_adobe.xfdf"
578
 
 
12
  from tools.file_redaction import redact_page_with_pymupdf
13
  import json
14
  import os
15
+ import re
16
  import pymupdf
17
+ from fitz import Document, Rect
18
  from PIL import ImageDraw, Image
19
  from collections import defaultdict
20
 
 
432
  return pdf_x1, pdf_y1, pdf_x2, pdf_y2
433
 
434
 
435
+ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[]):
436
  '''
437
  Create an xfdf file from a review csv file and a pdf
438
  '''
 
452
 
453
  pymupdf_page = pymupdf_doc.load_page(page_python_format)
454
 
455
+ # Load cropbox sizes
456
+ if document_cropboxes:
457
+ print("Document cropboxes:", document_cropboxes)
458
+
459
+ # Extract numbers safely using regex
460
+ match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])
461
+
462
+ if match and len(match) == 4:
463
+ rect_values = list(map(float, match)) # Convert extracted strings to floats
464
+ pymupdf_page.set_cropbox(Rect(*rect_values))
465
+ else:
466
+ raise ValueError(f"Invalid cropbox format: {document_cropboxes[page_python_format]}")
467
+ else:
468
+ print("Document cropboxes not found.")
469
+
470
+ pdf_page_height = pymupdf_page.mediabox.height
471
+ pdf_page_width = pymupdf_page.mediabox.width
472
 
473
  image = image_paths[page_python_format]
474
 
 
551
 
552
  return xml_str
553
 
554
+ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[]):
555
  '''
556
  Load in files to convert a review file into an Adobe comment file format
557
  '''
 
588
 
589
  df.fillna('', inplace=True) # Replace NaN with an empty string
590
 
591
+ xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths, document_cropboxes)
592
 
593
  output_path = output_folder + file_path_name + "_adobe.xfdf"
594