Commit
·
08a3ec3
1
Parent(s):
c6b043a
Redaction is now performed on the whole PDF mediabox (which is sometimes larger than the viewable area), and the results are then converted back to cropbox size for print and Adobe review. Improved some error raising and app flow.
Browse files- app.py +57 -56
- tools/file_conversion.py +12 -126
- tools/file_redaction.py +36 -28
- tools/helper_functions.py +1 -1
- tools/redaction_review.py +22 -6
app.py
CHANGED
@@ -83,6 +83,7 @@ with app:
|
|
83 |
do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
|
84 |
|
85 |
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
|
|
|
86 |
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
|
87 |
|
88 |
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
|
@@ -121,7 +122,7 @@ with app:
|
|
121 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
122 |
|
123 |
## Annotator zoom value
|
124 |
-
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=
|
125 |
zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
|
126 |
zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
|
127 |
|
@@ -212,22 +213,21 @@ with app:
|
|
212 |
with gr.Accordion(label = "Review redaction file", open=True):
|
213 |
output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
|
214 |
upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
|
215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
with gr.Row():
|
217 |
annotation_last_page_button = gr.Button("Previous page", scale = 3)
|
218 |
annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
|
219 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
|
220 |
annotation_next_page_button = gr.Button("Next page", scale = 3)
|
221 |
-
with gr.Row():
|
222 |
-
annotate_zoom_in = gr.Button("Zoom in")
|
223 |
-
annotate_zoom_out = gr.Button("Zoom out")
|
224 |
-
with gr.Row():
|
225 |
-
annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
|
226 |
-
with gr.Row():
|
227 |
-
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
228 |
|
229 |
with gr.Row():
|
230 |
-
with gr.Column(scale=
|
231 |
|
232 |
zoom_str = str(annotator_zoom_number) + '%'
|
233 |
|
@@ -248,6 +248,10 @@ with app:
|
|
248 |
handles_cursor=True,
|
249 |
interactive=False
|
250 |
)
|
|
|
|
|
|
|
|
|
251 |
|
252 |
with gr.Row():
|
253 |
annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
|
@@ -255,15 +259,12 @@ with app:
|
|
255 |
annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
|
256 |
annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
|
257 |
|
258 |
-
|
259 |
-
with gr.Row():
|
260 |
-
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
261 |
-
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
262 |
|
263 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
264 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
265 |
adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
|
266 |
-
convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="
|
267 |
|
268 |
###
|
269 |
# TEXT / TABULAR DATA TAB
|
@@ -369,19 +370,19 @@ with app:
|
|
369 |
###
|
370 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
371 |
|
372 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
373 |
-
|
374 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
|
375 |
-
|
376 |
|
377 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
378 |
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
|
379 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
|
380 |
-
|
381 |
|
382 |
# If a file has been completed, the function will continue onto the next document
|
383 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
384 |
-
|
385 |
|
386 |
###
|
387 |
# REVIEW PDF REDACTIONS
|
@@ -389,85 +390,85 @@ with app:
|
|
389 |
|
390 |
# Upload previous files for modifying redactions
|
391 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
|
396 |
# Page controls at top
|
397 |
annotate_current_page.submit(
|
398 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
399 |
-
|
400 |
-
|
401 |
|
402 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
|
407 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
|
412 |
# Zoom in and out on annotator
|
413 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
414 |
-
|
415 |
|
416 |
annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
417 |
-
|
418 |
|
419 |
annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
420 |
|
421 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
422 |
-
|
423 |
|
424 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
425 |
|
426 |
# Page controls at bottom
|
427 |
annotate_current_page_bottom.submit(
|
428 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
429 |
-
|
430 |
-
|
431 |
|
432 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
|
437 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
|
442 |
# Review table controls
|
443 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
444 |
|
445 |
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
|
450 |
# Convert review file to xfdf Adobe format
|
451 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
452 |
-
|
453 |
-
|
454 |
|
455 |
# Convert xfdf Adobe file back to review_file.csv
|
456 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
457 |
-
|
458 |
-
|
459 |
|
460 |
###
|
461 |
# TABULAR DATA REDACTION
|
462 |
###
|
463 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
464 |
-
|
465 |
|
466 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
467 |
|
468 |
# If the output file count text box changes, keep going with redacting each data file until done
|
469 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
470 |
-
|
471 |
|
472 |
###
|
473 |
# IDENTIFY DUPLICATE PAGES
|
@@ -500,7 +501,7 @@ with app:
|
|
500 |
# print("default_allow_list_output_folder_location:", default_allow_list_loc)
|
501 |
# if not os.path.exists(default_allow_list_loc):
|
502 |
# app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
|
503 |
-
#
|
504 |
# else:
|
505 |
# app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
|
506 |
|
@@ -508,25 +509,25 @@ with app:
|
|
508 |
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
509 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
510 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
511 |
-
|
512 |
|
513 |
# User submitted feedback for pdf redactions
|
514 |
pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
515 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
|
516 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
|
517 |
-
|
518 |
|
519 |
# User submitted feedback for data redactions
|
520 |
data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
521 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
|
522 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
|
523 |
-
|
524 |
|
525 |
# Log processing time/token usage when making a query
|
526 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
527 |
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
|
528 |
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
|
529 |
-
|
530 |
|
531 |
# Get some environment variables and Launch the Gradio app
|
532 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
|
|
83 |
do_not_save_pdf_state = gr.Checkbox(label="do_not_save_pdf_state", value=False, visible=False)
|
84 |
|
85 |
prepared_pdf_state = gr.Dropdown(label = "prepared_pdf_list", value="", allow_custom_value=True,visible=False)
|
86 |
+
document_cropboxes = gr.Dropdown(label = "document_cropboxes", value="", allow_custom_value=True,visible=False)
|
87 |
images_pdf_state = gr.Dropdown(label = "images_pdf_list", value="", allow_custom_value=True,visible=False)
|
88 |
|
89 |
output_image_files_state = gr.Dropdown(label = "output_image_files_list", value="", allow_custom_value=True,visible=False)
|
|
|
122 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
123 |
|
124 |
## Annotator zoom value
|
125 |
+
annotator_zoom_number = gr.Number(label = "Current annotator zoom level", value=100, precision=0, visible=False)
|
126 |
zoom_true_bool = gr.Checkbox(label="zoom_true_bool", value=True, visible=False)
|
127 |
zoom_false_bool = gr.Checkbox(label="zoom_false_bool", value=False, visible=False)
|
128 |
|
|
|
213 |
with gr.Accordion(label = "Review redaction file", open=True):
|
214 |
output_review_files = gr.File(label="Review output files", file_count='multiple', height=file_input_height)
|
215 |
upload_previous_review_file_btn = gr.Button("Review previously created redaction file (upload original PDF and ...review_file.csv)", variant="primary")
|
216 |
+
with gr.Row():
|
217 |
+
annotation_button_apply = gr.Button("Apply revised redactions to pdf", variant="secondary")
|
218 |
+
with gr.Row():
|
219 |
+
annotate_zoom_in = gr.Button("Zoom in", visible=False)
|
220 |
+
annotate_zoom_out = gr.Button("Zoom out", visible=False)
|
221 |
+
with gr.Row():
|
222 |
+
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
223 |
with gr.Row():
|
224 |
annotation_last_page_button = gr.Button("Previous page", scale = 3)
|
225 |
annotate_current_page = gr.Number(value=1, label="Page (press enter to change)", precision=0, scale = 2)
|
226 |
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
|
227 |
annotation_next_page_button = gr.Button("Next page", scale = 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
228 |
|
229 |
with gr.Row():
|
230 |
+
with gr.Column(scale=3):
|
231 |
|
232 |
zoom_str = str(annotator_zoom_number) + '%'
|
233 |
|
|
|
248 |
handles_cursor=True,
|
249 |
interactive=False
|
250 |
)
|
251 |
+
with gr.Column(scale=1):
|
252 |
+
#with gr.Row():
|
253 |
+
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
254 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=(2,"fixed"), type="pandas", label="Search results. Click to go to page")
|
255 |
|
256 |
with gr.Row():
|
257 |
annotation_last_page_button_bottom = gr.Button("Previous page", scale = 3)
|
|
|
259 |
annotate_max_pages_bottom = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 1)
|
260 |
annotation_next_page_button_bottom = gr.Button("Next page", scale = 3)
|
261 |
|
262 |
+
|
|
|
|
|
|
|
263 |
|
264 |
with gr.Accordion("Convert review files loaded above to Adobe format, or convert from Adobe format to review file", open = False):
|
265 |
convert_review_file_to_adobe_btn = gr.Button("Convert review file to Adobe comment format", variant="primary")
|
266 |
adobe_review_files_out = gr.File(label="Output Adobe comment files will appear here. If converting from .xfdf file to review_file.csv, upload the original pdf with the xfdf file here then click Convert below.", file_count='multiple', file_types=['.csv', '.xfdf', '.pdf'])
|
267 |
+
convert_adobe_to_review_file_btn = gr.Button("Convert Adobe .xfdf comment file to review_file.csv", variant="secondary")
|
268 |
|
269 |
###
|
270 |
# TEXT / TABULAR DATA TAB
|
|
|
370 |
###
|
371 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
372 |
|
373 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, document_cropboxes]).\
|
374 |
+
success(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
|
375 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state], api_name="redact_doc").\
|
376 |
+
success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
377 |
|
378 |
# If the app has completed a batch of pages, it will run this until the end of all pages in the document
|
379 |
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox],
|
380 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state]).\
|
381 |
+
success(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
382 |
|
383 |
# If a file has been completed, the function will continue onto the next document
|
384 |
latest_file_completed_text.change(fn=update_annotator, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
385 |
+
success(fn=reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
386 |
|
387 |
###
|
388 |
# REVIEW PDF REDACTIONS
|
|
|
390 |
|
391 |
# Upload previous files for modifying redactions
|
392 |
upload_previous_review_file_btn.click(fn=reset_review_vars, inputs=None, outputs=[recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
393 |
+
success(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
394 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes], api_name="prepare_doc").\
|
395 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
396 |
|
397 |
# Page controls at top
|
398 |
annotate_current_page.submit(
|
399 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
400 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
401 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
402 |
|
403 |
annotation_last_page_button.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
404 |
+
success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
405 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
406 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
407 |
|
408 |
annotation_next_page_button.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
409 |
+
success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
410 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
411 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
412 |
|
413 |
# Zoom in and out on annotator
|
414 |
annotate_zoom_in.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
415 |
+
success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_true_bool], outputs=[annotator_zoom_number, annotate_current_page])
|
416 |
|
417 |
annotate_zoom_out.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
418 |
+
success(update_zoom, inputs=[annotator_zoom_number, annotate_current_page, zoom_false_bool], outputs=[annotator_zoom_number, annotate_current_page])
|
419 |
|
420 |
annotator_zoom_number.change(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
421 |
|
422 |
clear_all_redactions_on_page_btn.click(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base, clear_all_page_redactions], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
423 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base])
|
424 |
|
425 |
annotation_button_apply.click(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output], scroll_to_output=True)
|
426 |
|
427 |
# Page controls at bottom
|
428 |
annotate_current_page_bottom.submit(
|
429 |
modify_existing_page_redactions, inputs = [annotator, annotate_current_page_bottom, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
430 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
431 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
432 |
|
433 |
annotation_last_page_button_bottom.click(fn=decrease_page, inputs=[annotate_current_page], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
434 |
+
success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
435 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
436 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
437 |
|
438 |
annotation_next_page_button_bottom.click(fn=increase_page, inputs=[annotate_current_page, all_image_annotations_state], outputs=[annotate_current_page, annotate_current_page_bottom]).\
|
439 |
+
success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
440 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
441 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
442 |
|
443 |
# Review table controls
|
444 |
recogniser_entity_dropdown.select(update_entities_df, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs=[recogniser_entity_dataframe])
|
445 |
|
446 |
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page]).\
|
447 |
+
success(modify_existing_page_redactions, inputs = [annotator, annotate_current_page, annotate_previous_page, all_image_annotations_state, recogniser_entity_dropdown, recogniser_entity_dataframe_base], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom, recogniser_entity_dropdown, recogniser_entity_dataframe_base]).\
|
448 |
+
success(update_annotator, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
449 |
+
success(apply_redactions, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output])
|
450 |
|
451 |
# Convert review file to xfdf Adobe format
|
452 |
convert_review_file_to_adobe_btn.click(fn=get_input_file_names, inputs=[output_review_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
453 |
+
success(fn = prepare_image_or_pdf, inputs=[output_review_files, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
|
454 |
+
success(convert_df_to_xfdf, inputs=[output_review_files, pdf_doc_state, images_pdf_state, output_folder_textbox, document_cropboxes], outputs=[adobe_review_files_out])
|
455 |
|
456 |
# Convert xfdf Adobe file back to review_file.csv
|
457 |
convert_adobe_to_review_file_btn.click(fn=get_input_file_names, inputs=[adobe_review_files_out], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list]).\
|
458 |
+
success(fn = prepare_image_or_pdf, inputs=[adobe_review_files_out, in_redaction_method, latest_file_completed_text, output_summary, second_loop_state, annotate_max_pages, all_image_annotations_state, prepare_for_review_bool], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state, document_cropboxes]).\
|
459 |
+
success(fn=convert_xfdf_to_dataframe, inputs=[adobe_review_files_out, pdf_doc_state, images_pdf_state, output_folder_textbox], outputs=[output_review_files], scroll_to_output=True)
|
460 |
|
461 |
###
|
462 |
# TABULAR DATA REDACTION
|
463 |
###
|
464 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets]).\
|
465 |
+
success(fn=get_input_file_names, inputs=[in_data_files], outputs=[data_file_name_no_extension_textbox, data_file_name_with_extension_textbox, data_full_file_name_textbox, data_file_name_textbox_list])
|
466 |
|
467 |
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_data")
|
468 |
|
469 |
# If the output file count text box changes, keep going with redacting each data file until done
|
470 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state, output_folder_textbox, in_deny_list_state, max_fuzzy_spelling_mistakes_num, pii_identification_method_drop_tabular, in_redact_comprehend_entities, comprehend_query_number, aws_access_key_textbox, aws_secret_key_textbox], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
471 |
+
success(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
472 |
|
473 |
###
|
474 |
# IDENTIFY DUPLICATE PAGES
|
|
|
501 |
# print("default_allow_list_output_folder_location:", default_allow_list_loc)
|
502 |
# if not os.path.exists(default_allow_list_loc):
|
503 |
# app.load(download_file_from_s3, inputs=[s3_default_bucket, s3_default_allow_list_file, default_allow_list_output_folder_location]).\
|
504 |
+
# success(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
|
505 |
# else:
|
506 |
# app.load(load_in_default_allow_list, inputs = [default_allow_list_output_folder_location], outputs=[in_allow_list])
|
507 |
|
|
|
509 |
access_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
510 |
access_callback.setup([session_hash_textbox], access_logs_folder)
|
511 |
session_hash_textbox.change(lambda *args: access_callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
512 |
+
success(fn = upload_file_to_s3, inputs=[access_logs_state, access_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
513 |
|
514 |
# User submitted feedback for pdf redactions
|
515 |
pdf_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
516 |
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], feedback_logs_folder)
|
517 |
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, doc_file_name_no_extension_textbox], None, preprocess=False).\
|
518 |
+
success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
519 |
|
520 |
# User submitted feedback for data redactions
|
521 |
data_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
522 |
data_callback.setup([data_feedback_radio, data_further_details_text, data_full_file_name_textbox], feedback_logs_folder)
|
523 |
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, data_full_file_name_textbox], None, preprocess=False).\
|
524 |
+
success(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
525 |
|
526 |
# Log processing time/token usage when making a query
|
527 |
usage_callback = CSVLogger_custom(dataset_file_name=log_file_name)
|
528 |
usage_callback.setup([session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], usage_logs_folder)
|
529 |
latest_file_completed_text.change(lambda *args: usage_callback.flag(list(args)), [session_hash_textbox, doc_file_name_no_extension_textbox, data_full_file_name_textbox, estimated_time_taken_number, textract_metadata_textbox, pii_identification_method_drop, comprehend_query_number], None, preprocess=False).\
|
530 |
+
success(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
531 |
|
532 |
# Get some environment variables and Launch the Gradio app
|
533 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
tools/file_conversion.py
CHANGED
@@ -70,7 +70,7 @@ def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_d
|
|
70 |
else:
|
71 |
# Convert PDF page to image
|
72 |
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
73 |
-
dpi=image_dpi, use_cropbox=
|
74 |
image = image_l[0]
|
75 |
image = image.convert("L")
|
76 |
image.save(out_path, format="PNG")
|
@@ -139,59 +139,6 @@ def convert_pdf_to_images(pdf_path: str, prepare_for_review:bool=False, page_min
|
|
139 |
return images
|
140 |
|
141 |
|
142 |
-
# def convert_pdf_to_images(pdf_path:str, page_min:int = 0, image_dpi:float = image_dpi, progress=Progress(track_tqdm=True)):
|
143 |
-
|
144 |
-
# print("pdf_path in convert_pdf_to_images:", pdf_path)
|
145 |
-
|
146 |
-
# # Get the number of pages in the PDF
|
147 |
-
# page_count = pdfinfo_from_path(pdf_path)['Pages']
|
148 |
-
# print("Number of pages in PDF: ", str(page_count))
|
149 |
-
|
150 |
-
# images = []
|
151 |
-
|
152 |
-
# # Open the PDF file
|
153 |
-
# #for page_num in progress.tqdm(range(0,page_count), total=page_count, unit="pages", desc="Converting pages"): range(page_min,page_count): #
|
154 |
-
# for page_num in tqdm(range(page_min,page_count), total=page_count, unit="pages", desc="Preparing pages"):
|
155 |
-
|
156 |
-
# #print("page_num in convert_pdf_to_images:", page_num)
|
157 |
-
|
158 |
-
# print("Converting page: ", str(page_num + 1))
|
159 |
-
|
160 |
-
# # Convert one page to image
|
161 |
-
# out_path = pdf_path + "_" + str(page_num) + ".png"
|
162 |
-
|
163 |
-
# # Ensure the directory exists
|
164 |
-
# os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
165 |
-
|
166 |
-
# # Check if the image already exists
|
167 |
-
# if os.path.exists(out_path):
|
168 |
-
# #print(f"Loading existing image from {out_path}.")
|
169 |
-
# image = Image.open(out_path) # Load the existing image
|
170 |
-
|
171 |
-
# else:
|
172 |
-
# image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1, dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
173 |
-
|
174 |
-
# image = image_l[0]
|
175 |
-
|
176 |
-
# # Convert to greyscale
|
177 |
-
# image = image.convert("L")
|
178 |
-
|
179 |
-
# image.save(out_path, format="PNG") # Save the new image
|
180 |
-
|
181 |
-
# # If no images are returned, break the loop
|
182 |
-
# if not image:
|
183 |
-
# print("Conversion of page", str(page_num), "to file failed.")
|
184 |
-
# break
|
185 |
-
|
186 |
-
# # print("Conversion of page", str(page_num), "to file succeeded.")
|
187 |
-
# # print("image:", image)
|
188 |
-
|
189 |
-
# images.append(out_path)
|
190 |
-
|
191 |
-
# print("PDF has been converted to images.")
|
192 |
-
# # print("Images:", images)
|
193 |
-
|
194 |
-
# return images
|
195 |
|
196 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
197 |
def process_file(file_path:str, prepare_for_review:bool=False):
|
@@ -304,71 +251,6 @@ def redact_single_box(pymupdf_page:Page, pymupdf_rect:Rect, img_annotation_box:d
|
|
304 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
305 |
shape.commit()
|
306 |
|
307 |
-
# def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
308 |
-
# '''
|
309 |
-
# Converts coordinates from pymupdf format to image coordinates,
|
310 |
-
# accounting for mediabox dimensions and offset.
|
311 |
-
# '''
|
312 |
-
# # Get rect dimensions
|
313 |
-
# rect = pymupdf_page.rect
|
314 |
-
# rect_width = rect.width
|
315 |
-
# rect_height = rect.height
|
316 |
-
|
317 |
-
# # Get mediabox dimensions and position
|
318 |
-
# mediabox = pymupdf_page.mediabox
|
319 |
-
# mediabox_width = mediabox.width
|
320 |
-
# mediabox_height = mediabox.height
|
321 |
-
|
322 |
-
# # Get target image dimensions
|
323 |
-
# image_page_width, image_page_height = image.size
|
324 |
-
|
325 |
-
# # Calculate scaling factors
|
326 |
-
# image_to_mediabox_x_scale = image_page_width / mediabox_width
|
327 |
-
# image_to_mediabox_y_scale = image_page_height / mediabox_height
|
328 |
-
|
329 |
-
# image_to_rect_scale_width = image_page_width / rect_width
|
330 |
-
# image_to_rect_scale_height = image_page_height / rect_height
|
331 |
-
|
332 |
-
# # Adjust for offsets (difference in position between mediabox and rect)
|
333 |
-
# x_offset = rect.x0 - mediabox.x0 # Difference in x position
|
334 |
-
# y_offset = rect.y0 - mediabox.y0 # Difference in y position
|
335 |
-
|
336 |
-
# print("x_offset:", x_offset)
|
337 |
-
# print("y_offset:", y_offset)
|
338 |
-
|
339 |
-
# # Adjust coordinates:
|
340 |
-
# # Apply scaling to match image dimensions
|
341 |
-
# x1_image = x1 * image_to_mediabox_x_scale
|
342 |
-
# x2_image = x2 * image_to_mediabox_x_scale
|
343 |
-
# y1_image = y1 * image_to_mediabox_y_scale
|
344 |
-
# y2_image = y2 * image_to_mediabox_y_scale
|
345 |
-
|
346 |
-
# # Correct for difference in rect and mediabox size
|
347 |
-
# if mediabox_width != rect_width:
|
348 |
-
|
349 |
-
# mediabox_to_rect_x_scale = mediabox_width / rect_width
|
350 |
-
# mediabox_to_rect_y_scale = mediabox_height / rect_height
|
351 |
-
|
352 |
-
# x1_image *= mediabox_to_rect_x_scale
|
353 |
-
# x2_image *= mediabox_to_rect_x_scale
|
354 |
-
# y1_image *= mediabox_to_rect_y_scale
|
355 |
-
# y2_image *= mediabox_to_rect_y_scale
|
356 |
-
|
357 |
-
# print("mediabox_to_rect_x_scale:", mediabox_to_rect_x_scale)
|
358 |
-
# #print("mediabox_to_rect_y_scale:", mediabox_to_rect_y_scale)
|
359 |
-
|
360 |
-
# print("image_to_mediabox_x_scale:", image_to_mediabox_x_scale)
|
361 |
-
# #print("image_to_mediabox_y_scale:", image_to_mediabox_y_scale)
|
362 |
-
|
363 |
-
# mediabox_rect_x_diff = (mediabox_width - rect_width) * 2
|
364 |
-
# mediabox_rect_y_diff = (mediabox_height - rect_height) * 2
|
365 |
-
|
366 |
-
# x1_image -= mediabox_rect_x_diff
|
367 |
-
# x2_image -= mediabox_rect_x_diff
|
368 |
-
# y1_image += mediabox_rect_y_diff
|
369 |
-
# y2_image += mediabox_rect_y_diff
|
370 |
-
|
371 |
-
# return x1_image, y1_image, x2_image, y2_image
|
372 |
|
373 |
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
374 |
'''
|
@@ -434,8 +316,6 @@ def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
|
434 |
|
435 |
return x1_image, y1_image, x2_image, y2_image
|
436 |
|
437 |
-
|
438 |
-
|
439 |
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
|
440 |
# Small border to page that remains white
|
441 |
border = 5
|
@@ -498,6 +378,7 @@ def prepare_image_or_pdf(
|
|
498 |
|
499 |
tic = time.perf_counter()
|
500 |
json_from_csv = False
|
|
|
501 |
|
502 |
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
503 |
in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
|
@@ -586,14 +467,19 @@ def prepare_image_or_pdf(
|
|
586 |
if not file_path:
|
587 |
out_message = "Please select a file."
|
588 |
print(out_message)
|
589 |
-
|
590 |
-
|
591 |
file_extension = os.path.splitext(file_path)[1].lower()
|
592 |
|
593 |
# If a pdf, load as a pymupdf document
|
594 |
if is_pdf(file_path):
|
595 |
pymupdf_doc = pymupdf.open(file_path)
|
596 |
|
|
|
|
|
|
|
|
|
|
|
597 |
converted_file_path = file_path
|
598 |
image_file_paths = process_file(file_path, prepare_for_review)
|
599 |
|
@@ -737,13 +623,13 @@ def prepare_image_or_pdf(
|
|
737 |
if is_pdf_or_image(file_path) == False:
|
738 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
739 |
print(out_message)
|
740 |
-
|
741 |
|
742 |
elif in_redact_method == text_ocr_option:
|
743 |
if is_pdf(file_path) == False:
|
744 |
out_message = "Please upload a PDF file for text analysis."
|
745 |
print(out_message)
|
746 |
-
|
747 |
|
748 |
|
749 |
converted_file_paths.append(converted_file_path)
|
@@ -759,7 +645,7 @@ def prepare_image_or_pdf(
|
|
759 |
|
760 |
number_of_pages = len(image_file_paths)
|
761 |
|
762 |
-
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
763 |
|
764 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
765 |
file_path_without_ext = get_file_name_without_type(in_file_path)
|
|
|
70 |
else:
|
71 |
# Convert PDF page to image
|
72 |
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
73 |
+
dpi=image_dpi, use_cropbox=False, use_pdftocairo=False)
|
74 |
image = image_l[0]
|
75 |
image = image.convert("L")
|
76 |
image.save(out_path, format="PNG")
|
|
|
139 |
return images
|
140 |
|
141 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
142 |
|
143 |
# Function to take in a file path, decide if it is an image or pdf, then process appropriately.
|
144 |
def process_file(file_path:str, prepare_for_review:bool=False):
|
|
|
251 |
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
252 |
shape.commit()
|
253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
254 |
|
255 |
def convert_pymupdf_to_image_coords(pymupdf_page, x1, y1, x2, y2, image: Image):
|
256 |
'''
|
|
|
316 |
|
317 |
return x1_image, y1_image, x2_image, y2_image
|
318 |
|
|
|
|
|
319 |
def redact_whole_pymupdf_page(rect_height, rect_width, image, page, custom_colours, border = 5):
|
320 |
# Small border to page that remains white
|
321 |
border = 5
|
|
|
378 |
|
379 |
tic = time.perf_counter()
|
380 |
json_from_csv = False
|
381 |
+
original_cropboxes = [] # Store original CropBox values
|
382 |
|
383 |
if isinstance(in_fully_redacted_list, pd.DataFrame):
|
384 |
in_fully_redacted_list = in_fully_redacted_list.iloc[:,0].tolist()
|
|
|
467 |
if not file_path:
|
468 |
out_message = "Please select a file."
|
469 |
print(out_message)
|
470 |
+
raise Exception(out_message)
|
471 |
+
|
472 |
file_extension = os.path.splitext(file_path)[1].lower()
|
473 |
|
474 |
# If a pdf, load as a pymupdf document
|
475 |
if is_pdf(file_path):
|
476 |
pymupdf_doc = pymupdf.open(file_path)
|
477 |
|
478 |
+
# Load cropbox dimensions to use later
|
479 |
+
|
480 |
+
for page in pymupdf_doc:
|
481 |
+
original_cropboxes.append(page.cropbox) # Save original CropBox
|
482 |
+
|
483 |
converted_file_path = file_path
|
484 |
image_file_paths = process_file(file_path, prepare_for_review)
|
485 |
|
|
|
623 |
if is_pdf_or_image(file_path) == False:
|
624 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
625 |
print(out_message)
|
626 |
+
raise Exception(out_message)
|
627 |
|
628 |
elif in_redact_method == text_ocr_option:
|
629 |
if is_pdf(file_path) == False:
|
630 |
out_message = "Please upload a PDF file for text analysis."
|
631 |
print(out_message)
|
632 |
+
raise Exception(out_message)
|
633 |
|
634 |
|
635 |
converted_file_paths.append(converted_file_path)
|
|
|
645 |
|
646 |
number_of_pages = len(image_file_paths)
|
647 |
|
648 |
+
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv, original_cropboxes
|
649 |
|
650 |
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str], image_dpi:float=image_dpi):
|
651 |
file_path_without_ext = get_file_name_without_type(in_file_path)
|
tools/file_redaction.py
CHANGED
@@ -102,6 +102,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
102 |
annotate_max_pages:int=1,
|
103 |
review_file_state=[],
|
104 |
output_folder:str=output_folder,
|
|
|
105 |
progress=gr.Progress(track_tqdm=True)):
|
106 |
'''
|
107 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
@@ -140,6 +141,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
140 |
- aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
|
141 |
- annotate_max_pages (int, optional): Maximum page value for the annotation object
|
142 |
- output_folder (str, optional): Output folder for results.
|
|
|
143 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
144 |
|
145 |
The function returns a redacted document along with processing logs.
|
@@ -150,10 +152,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
150 |
|
151 |
# If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
|
152 |
if not pymupdf_doc:
|
153 |
-
print("Prepared PDF file not found,
|
154 |
-
out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages)
|
155 |
-
|
156 |
-
annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
|
157 |
|
158 |
#print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
159 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
@@ -183,7 +183,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
183 |
out_file_paths = []
|
184 |
estimate_total_processing_time = 0
|
185 |
estimated_time_taken_state = 0
|
186 |
-
|
187 |
# If not the first time around, and the current page loop has been set to a huge number (been through all pages), reset current page to 0
|
188 |
elif (first_loop_state == False) & (current_loop_page == 999):
|
189 |
current_loop_page = 0
|
@@ -200,12 +199,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
200 |
else:
|
201 |
number_of_files = len(file_paths)
|
202 |
|
203 |
-
# If we have already redacted the last file, return the input out_message and file list to the relevant
|
204 |
if latest_file_completed >= number_of_files:
|
205 |
|
206 |
print("Completed last file")
|
207 |
-
# Set to a very high number so as not to mix up with subsequent file processing by the user
|
208 |
-
# latest_file_completed = 99
|
209 |
current_loop_page = 0
|
210 |
|
211 |
if isinstance(out_message, list):
|
@@ -224,7 +221,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
224 |
|
225 |
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
|
226 |
|
227 |
-
# If we have reached the last page, return message
|
228 |
if current_loop_page >= number_of_pages:
|
229 |
print("Reached last page of document:", current_loop_page)
|
230 |
|
@@ -273,7 +270,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
273 |
comprehend_client = ""
|
274 |
out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
|
275 |
print(out_message)
|
276 |
-
|
|
|
277 |
else:
|
278 |
comprehend_client = ""
|
279 |
|
@@ -296,7 +294,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
296 |
textract_client = ""
|
297 |
out_message = "Cannot connect to AWS Textract. Please provide access keys under Textract settings on the Redaction settings tab,choose another text extraction method."
|
298 |
print(out_message)
|
299 |
-
|
|
|
300 |
else:
|
301 |
textract_client = ""
|
302 |
|
@@ -336,15 +335,14 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
336 |
else:
|
337 |
out_message = "No file selected"
|
338 |
print(out_message)
|
339 |
-
|
340 |
-
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
|
341 |
|
342 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
343 |
|
344 |
#Analyse and redact image-based pdf or image
|
345 |
if is_pdf_or_image(file_path) == False:
|
346 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
347 |
-
|
348 |
|
349 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
350 |
|
@@ -383,12 +381,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
383 |
all_request_metadata.append(new_request_metadata)
|
384 |
|
385 |
elif in_redact_method == text_ocr_option:
|
386 |
-
|
387 |
-
#log_files_output_paths = []
|
388 |
|
389 |
if is_pdf(file_path) == False:
|
390 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
391 |
-
|
392 |
|
393 |
# Analyse text-based pdf
|
394 |
print('Redacting file as text-based PDF')
|
@@ -418,7 +414,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
418 |
else:
|
419 |
out_message = "No redaction method selected"
|
420 |
print(out_message)
|
421 |
-
|
422 |
|
423 |
# If at last page, save to file
|
424 |
if current_loop_page >= number_of_pages:
|
@@ -434,8 +430,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
434 |
if is_pdf(file_path) == False:
|
435 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
|
436 |
pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
|
437 |
-
out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
|
438 |
-
|
439 |
else:
|
440 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
441 |
pymupdf_doc.save(out_redacted_pdf_file_path)
|
@@ -678,12 +673,13 @@ def move_page_info(file_path: str) -> str:
|
|
678 |
|
679 |
return new_file_path
|
680 |
|
681 |
-
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_coords:bool=True):
|
682 |
|
683 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
684 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
685 |
rect_height = page.rect.height
|
686 |
-
rect_width = page.rect.width
|
|
|
687 |
|
688 |
pymupdf_x1 = None
|
689 |
pymupdf_x2 = None
|
@@ -801,6 +797,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
|
|
801 |
}
|
802 |
|
803 |
page.apply_redactions(images=0, graphics=0)
|
|
|
804 |
page.clean_contents()
|
805 |
|
806 |
return page, out_annotation_boxes
|
@@ -1003,9 +1000,10 @@ def redact_image_pdf(file_path:str,
|
|
1003 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
1004 |
|
1005 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1006 |
-
|
|
|
|
|
1007 |
|
1008 |
-
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1009 |
|
1010 |
if analysis_type == textract_option and textract_client == "":
|
1011 |
print("Connection to AWS Textract service unsuccessful.")
|
@@ -1057,6 +1055,8 @@ def redact_image_pdf(file_path:str,
|
|
1057 |
|
1058 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1059 |
|
|
|
|
|
1060 |
for page_no in progress_bar:
|
1061 |
|
1062 |
handwriting_or_signature_boxes = []
|
@@ -1076,6 +1076,9 @@ def redact_image_pdf(file_path:str,
|
|
1076 |
|
1077 |
image_annotations = {"image": image, "boxes": []}
|
1078 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
|
|
|
|
|
|
1079 |
|
1080 |
if page_no >= page_min and page_no < page_max:
|
1081 |
|
@@ -1219,7 +1222,7 @@ def redact_image_pdf(file_path:str,
|
|
1219 |
else: redact_whole_page = False
|
1220 |
else: redact_whole_page = False
|
1221 |
|
1222 |
-
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image, redact_whole_page=redact_whole_page)
|
1223 |
|
1224 |
# Convert decision process to table
|
1225 |
decision_process_table = pd.DataFrame([{
|
@@ -1596,6 +1599,8 @@ def redact_text_pdf(
|
|
1596 |
if current_loop_page == 0: page_loop_start = 0
|
1597 |
else: page_loop_start = current_loop_page
|
1598 |
|
|
|
|
|
1599 |
progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1600 |
|
1601 |
#for page_no in range(0, number_of_pages):
|
@@ -1615,6 +1620,9 @@ def redact_text_pdf(
|
|
1615 |
image_annotations = {"image": image, "boxes": []}
|
1616 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
1617 |
|
|
|
|
|
|
|
1618 |
if page_min <= page_no < page_max:
|
1619 |
|
1620 |
if isinstance(image, str):
|
@@ -1701,15 +1709,15 @@ def redact_text_pdf(
|
|
1701 |
else: redact_whole_page = False
|
1702 |
else: redact_whole_page = False
|
1703 |
|
1704 |
-
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False)
|
1705 |
|
1706 |
reported_page_no = page_no + 1
|
1707 |
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1708 |
|
1709 |
# Join extracted text outputs for all lines together
|
1710 |
if not page_text_ocr_outputs.empty:
|
1711 |
-
|
1712 |
-
|
1713 |
|
1714 |
# Write logs
|
1715 |
# Create decision process table
|
|
|
102 |
annotate_max_pages:int=1,
|
103 |
review_file_state=[],
|
104 |
output_folder:str=output_folder,
|
105 |
+
document_cropboxes:List=[],
|
106 |
progress=gr.Progress(track_tqdm=True)):
|
107 |
'''
|
108 |
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
|
|
|
141 |
- aws_secret_key_textbox (str, optional): AWS secret key for account with Textract and Comprehend permissions.
|
142 |
- annotate_max_pages (int, optional): Maximum page value for the annotation object
|
143 |
- output_folder (str, optional): Output folder for results.
|
144 |
+
- document_cropboxes (List, optional): List of document cropboxes for the PDF.
|
145 |
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.
|
146 |
|
147 |
The function returns a redacted document along with processing logs.
|
|
|
152 |
|
153 |
# If there are no prepared PDF file paths, it is most likely that the prepare_image_or_pdf function has not been run. So do it here to get the outputs you need
|
154 |
if not pymupdf_doc:
|
155 |
+
print("Prepared PDF file not found, loading from file")
|
156 |
+
out_message, prepared_pdf_file_paths, prepared_pdf_image_paths, annotate_max_pages, annotate_max_pages, pymupdf_doc, annotations_all_pages, review_file_state, document_cropboxes = prepare_image_or_pdf(file_paths, in_redact_method, latest_file_completed, out_message, first_loop_state, annotate_max_pages, annotations_all_pages, document_cropboxes)
|
|
|
|
|
157 |
|
158 |
#print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
159 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
|
|
183 |
out_file_paths = []
|
184 |
estimate_total_processing_time = 0
|
185 |
estimated_time_taken_state = 0
|
|
|
186 |
# If not the first time around, and the current page loop has been set to a huge number (been through all pages), reset current page to 0
|
187 |
elif (first_loop_state == False) & (current_loop_page == 999):
|
188 |
current_loop_page = 0
|
|
|
199 |
else:
|
200 |
number_of_files = len(file_paths)
|
201 |
|
202 |
+
# If we have already redacted the last file, return the input out_message and file list to the relevant outputs
|
203 |
if latest_file_completed >= number_of_files:
|
204 |
|
205 |
print("Completed last file")
|
|
|
|
|
206 |
current_loop_page = 0
|
207 |
|
208 |
if isinstance(out_message, list):
|
|
|
221 |
|
222 |
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages, gr.Number(value=current_loop_page,precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, prepared_pdf_image_paths, review_file_state
|
223 |
|
224 |
+
# If we have reached the last page, return message and outputs
|
225 |
if current_loop_page >= number_of_pages:
|
226 |
print("Reached last page of document:", current_loop_page)
|
227 |
|
|
|
270 |
comprehend_client = ""
|
271 |
out_message = "Cannot connect to AWS Comprehend service. Please provide access keys under Textract settings on the Redaction settings tab, or choose another PII identification method."
|
272 |
print(out_message)
|
273 |
+
raise Exception(out_message)
|
274 |
+
|
275 |
else:
|
276 |
comprehend_client = ""
|
277 |
|
|
|
294 |
textract_client = ""
|
295 |
out_message = "Cannot connect to AWS Textract. Please provide access keys under Textract settings on the Redaction settings tab,choose another text extraction method."
|
296 |
print(out_message)
|
297 |
+
raise Exception(out_message)
|
298 |
+
|
299 |
else:
|
300 |
textract_client = ""
|
301 |
|
|
|
335 |
else:
|
336 |
out_message = "No file selected"
|
337 |
print(out_message)
|
338 |
+
raise Exception(out_message)
|
|
|
339 |
|
340 |
if in_redact_method == tesseract_ocr_option or in_redact_method == textract_option:
|
341 |
|
342 |
#Analyse and redact image-based pdf or image
|
343 |
if is_pdf_or_image(file_path) == False:
|
344 |
out_message = "Please upload a PDF file or image file (JPG, PNG) for image analysis."
|
345 |
+
raise Exception(out_message)
|
346 |
|
347 |
print("Redacting file " + pdf_file_name_with_ext + " as an image-based file")
|
348 |
|
|
|
381 |
all_request_metadata.append(new_request_metadata)
|
382 |
|
383 |
elif in_redact_method == text_ocr_option:
|
|
|
|
|
384 |
|
385 |
if is_pdf(file_path) == False:
|
386 |
out_message = "Please upload a PDF file for text analysis. If you have an image, select 'Image analysis'."
|
387 |
+
raise Exception(out_message)
|
388 |
|
389 |
# Analyse text-based pdf
|
390 |
print('Redacting file as text-based PDF')
|
|
|
414 |
else:
|
415 |
out_message = "No redaction method selected"
|
416 |
print(out_message)
|
417 |
+
raise Exception(out_message)
|
418 |
|
419 |
# If at last page, save to file
|
420 |
if current_loop_page >= number_of_pages:
|
|
|
430 |
if is_pdf(file_path) == False:
|
431 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted_as_pdf.pdf"
|
432 |
pymupdf_doc[-1].save(out_redacted_pdf_file_path, "PDF" ,resolution=image_dpi, save_all=False)#, append_images=pymupdf_doc[:1])
|
433 |
+
out_review_file_path = output_folder + pdf_file_name_without_ext + '_review_file.csv'
|
|
|
434 |
else:
|
435 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
436 |
pymupdf_doc.save(out_redacted_pdf_file_path)
|
|
|
673 |
|
674 |
return new_file_path
|
675 |
|
676 |
+
def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_coords:bool=True, original_cropbox=[]):
|
677 |
|
678 |
mediabox_height = page.mediabox[3] - page.mediabox[1]
|
679 |
mediabox_width = page.mediabox[2] - page.mediabox[0]
|
680 |
rect_height = page.rect.height
|
681 |
+
rect_width = page.rect.width
|
682 |
+
|
683 |
|
684 |
pymupdf_x1 = None
|
685 |
pymupdf_x2 = None
|
|
|
797 |
}
|
798 |
|
799 |
page.apply_redactions(images=0, graphics=0)
|
800 |
+
page.set_cropbox(original_cropbox) # Set CropBox to original size
|
801 |
page.clean_contents()
|
802 |
|
803 |
return page, out_annotation_boxes
|
|
|
1000 |
image_analyser = CustomImageAnalyzerEngine(nlp_analyser)
|
1001 |
|
1002 |
if pii_identification_method == "AWS Comprehend" and comprehend_client == "":
|
1003 |
+
out_message = "Connection to AWS Comprehend service unsuccessful."
|
1004 |
+
print(out_message)
|
1005 |
+
raise Exception(out_message)
|
1006 |
|
|
|
1007 |
|
1008 |
if analysis_type == textract_option and textract_client == "":
|
1009 |
print("Connection to AWS Textract service unsuccessful.")
|
|
|
1055 |
|
1056 |
progress_bar = tqdm(range(page_loop_start, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1057 |
|
1058 |
+
original_cropboxes = []
|
1059 |
+
|
1060 |
for page_no in progress_bar:
|
1061 |
|
1062 |
handwriting_or_signature_boxes = []
|
|
|
1076 |
|
1077 |
image_annotations = {"image": image, "boxes": []}
|
1078 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
1079 |
+
|
1080 |
+
original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
|
1081 |
+
pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
|
1082 |
|
1083 |
if page_no >= page_min and page_no < page_max:
|
1084 |
|
|
|
1222 |
else: redact_whole_page = False
|
1223 |
else: redact_whole_page = False
|
1224 |
|
1225 |
+
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, merged_redaction_bboxes, image, redact_whole_page=redact_whole_page, original_cropbox=original_cropboxes[-1])
|
1226 |
|
1227 |
# Convert decision process to table
|
1228 |
decision_process_table = pd.DataFrame([{
|
|
|
1599 |
if current_loop_page == 0: page_loop_start = 0
|
1600 |
else: page_loop_start = current_loop_page
|
1601 |
|
1602 |
+
original_cropboxes = []
|
1603 |
+
|
1604 |
progress_bar = tqdm(range(current_loop_page, number_of_pages), unit="pages remaining", desc="Redacting pages")
|
1605 |
|
1606 |
#for page_no in range(0, number_of_pages):
|
|
|
1620 |
image_annotations = {"image": image, "boxes": []}
|
1621 |
pymupdf_page = pymupdf_doc.load_page(page_no)
|
1622 |
|
1623 |
+
original_cropboxes.append(pymupdf_page.cropbox) # Save original CropBox
|
1624 |
+
pymupdf_page.set_cropbox(pymupdf_page.mediabox) # Set CropBox to MediaBox
|
1625 |
+
|
1626 |
if page_min <= page_no < page_max:
|
1627 |
|
1628 |
if isinstance(image, str):
|
|
|
1709 |
else: redact_whole_page = False
|
1710 |
else: redact_whole_page = False
|
1711 |
|
1712 |
+
pymupdf_page, image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_annotations_on_page, image, redact_whole_page=redact_whole_page, convert_coords=False, original_cropbox=original_cropboxes[-1])
|
1713 |
|
1714 |
reported_page_no = page_no + 1
|
1715 |
print("For page number:", reported_page_no, "there are", len(image_annotations["boxes"]), "annotations")
|
1716 |
|
1717 |
# Join extracted text outputs for all lines together
|
1718 |
if not page_text_ocr_outputs.empty:
|
1719 |
+
page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
1720 |
+
all_line_level_ocr_results_df = pd.concat([all_line_level_ocr_results_df, page_text_ocr_outputs])
|
1721 |
|
1722 |
# Write logs
|
1723 |
# Create decision process table
|
tools/helper_functions.py
CHANGED
@@ -60,7 +60,7 @@ def reset_state_vars():
|
|
60 |
show_share_button=False,
|
61 |
show_remove_button=False,
|
62 |
interactive=False
|
63 |
-
), [], [], [], pd.DataFrame(), pd.DataFrame()
|
64 |
|
65 |
def reset_review_vars():
|
66 |
return [], pd.DataFrame(), pd.DataFrame()
|
|
|
60 |
show_share_button=False,
|
61 |
show_remove_button=False,
|
62 |
interactive=False
|
63 |
+
), [], [], [], pd.DataFrame(), pd.DataFrame(), []
|
64 |
|
65 |
def reset_review_vars():
|
66 |
return [], pd.DataFrame(), pd.DataFrame()
|
tools/redaction_review.py
CHANGED
@@ -12,8 +12,9 @@ from tools.helper_functions import get_file_name_without_type, output_folder, de
|
|
12 |
from tools.file_redaction import redact_page_with_pymupdf
|
13 |
import json
|
14 |
import os
|
|
|
15 |
import pymupdf
|
16 |
-
from fitz import Document
|
17 |
from PIL import ImageDraw, Image
|
18 |
from collections import defaultdict
|
19 |
|
@@ -431,7 +432,7 @@ def convert_image_coords_to_adobe(pdf_page_width:float, pdf_page_height:float, i
|
|
431 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
432 |
|
433 |
|
434 |
-
def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str]):
|
435 |
'''
|
436 |
Create an xfdf file from a review csv file and a pdf
|
437 |
'''
|
@@ -451,8 +452,23 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
|
|
451 |
|
452 |
pymupdf_page = pymupdf_doc.load_page(page_python_format)
|
453 |
|
454 |
-
|
455 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
456 |
|
457 |
image = image_paths[page_python_format]
|
458 |
|
@@ -535,7 +551,7 @@ def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc, image_paths:List[str
|
|
535 |
|
536 |
return xml_str
|
537 |
|
538 |
-
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder):
|
539 |
'''
|
540 |
Load in files to convert a review file into an Adobe comment file format
|
541 |
'''
|
@@ -572,7 +588,7 @@ def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], ou
|
|
572 |
|
573 |
df.fillna('', inplace=True) # Replace NaN with an empty string
|
574 |
|
575 |
-
xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths)
|
576 |
|
577 |
output_path = output_folder + file_path_name + "_adobe.xfdf"
|
578 |
|
|
|
12 |
from tools.file_redaction import redact_page_with_pymupdf
|
13 |
import json
|
14 |
import os
|
15 |
+
import re
|
16 |
import pymupdf
|
17 |
+
from fitz import Document, Rect
|
18 |
from PIL import ImageDraw, Image
|
19 |
from collections import defaultdict
|
20 |
|
|
|
432 |
return pdf_x1, pdf_y1, pdf_x2, pdf_y2
|
433 |
|
434 |
|
435 |
+
def create_xfdf(df:pd.DataFrame, pdf_path:str, pymupdf_doc:object, image_paths:List[str], document_cropboxes:List=[]):
|
436 |
'''
|
437 |
Create an xfdf file from a review csv file and a pdf
|
438 |
'''
|
|
|
452 |
|
453 |
pymupdf_page = pymupdf_doc.load_page(page_python_format)
|
454 |
|
455 |
+
# Load cropbox sizes
|
456 |
+
if document_cropboxes:
|
457 |
+
print("Document cropboxes:", document_cropboxes)
|
458 |
+
|
459 |
+
# Extract numbers safely using regex
|
460 |
+
match = re.findall(r"[-+]?\d*\.\d+|\d+", document_cropboxes[page_python_format])
|
461 |
+
|
462 |
+
if match and len(match) == 4:
|
463 |
+
rect_values = list(map(float, match)) # Convert extracted strings to floats
|
464 |
+
pymupdf_page.set_cropbox(Rect(*rect_values))
|
465 |
+
else:
|
466 |
+
raise ValueError(f"Invalid cropbox format: {document_cropboxes[page_python_format]}")
|
467 |
+
else:
|
468 |
+
print("Document cropboxes not found.")
|
469 |
+
|
470 |
+
pdf_page_height = pymupdf_page.mediabox.height
|
471 |
+
pdf_page_width = pymupdf_page.mediabox.width
|
472 |
|
473 |
image = image_paths[page_python_format]
|
474 |
|
|
|
551 |
|
552 |
return xml_str
|
553 |
|
554 |
+
def convert_df_to_xfdf(input_files:List[str], pdf_doc, image_paths:List[str], output_folder:str = output_folder, document_cropboxes:List=[]):
|
555 |
'''
|
556 |
Load in files to convert a review file into an Adobe comment file format
|
557 |
'''
|
|
|
588 |
|
589 |
df.fillna('', inplace=True) # Replace NaN with an empty string
|
590 |
|
591 |
+
xfdf_content = create_xfdf(df, pdf_name, pdf_doc, image_paths, document_cropboxes)
|
592 |
|
593 |
output_path = output_folder + file_path_name + "_adobe.xfdf"
|
594 |
|