Merge pull request #25 from seanpedrick-case/dev
PDF compression removed by default, some new config options, minor formatting and layout changes
- README.md +1 -1
- app.py +14 -14
- pyproject.toml +2 -2
- requirements.txt +1 -1
- tools/config.py +53 -18
- tools/file_conversion.py +11 -1
- tools/file_redaction.py +30 -21
- tools/redaction_review.py +5 -4
- tools/textract_batch_call.py +21 -12
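The main behavioural change is driven by new environment options: redacted PDFs are no longer compressed on save by default (COMPRESS_REDACTED_PDF defaults to 'False'), RETURN_PDF_END_OF_REDACTION controls whether a redacted PDF is written at the end of the redaction task, and DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS limits how far back the Textract job list looks. A minimal sketch of overriding them (values are illustrative, not recommendations; the same settings can equally go in config/app_config.env, which the app loads with load_dotenv):

```python
import os

# Illustrative only: set before tools.config is imported, since config reads these at import time.
os.environ["COMPRESS_REDACTED_PDF"] = "True"               # re-enable full pymupdf compression
os.environ["RETURN_PDF_END_OF_REDACTION"] = "False"        # don't write a redacted PDF until review
os.environ["DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS"] = "14"   # show older whole-document Textract jobs
os.environ["TEXTRACT_JOBS_LOCAL_LOC"] = "output"           # local subfolder for Textract job outputs
```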
README.md
CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
---
# Document redaction

-version: 0.6.
+version: 0.6.5

Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
app.py
CHANGED
@@ -219,8 +219,8 @@ with app:
check_state_of_textract_api_call_btn = gr.Button("Check state of Textract document job and download", variant="secondary", visible=False)
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=False)
job_type_dropdown = gr.Dropdown(value="document_text_detection", choices=["document_text_detection", "document_analysis"], label="Job type of Textract analysis job", allow_custom_value=False, visible=False)
-textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','
-selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','
+textract_job_detail_df = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Previous job details", visible=False, type="pandas", wrap=True)
+selected_job_id_row = gr.Dataframe(pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time']), label="Selected job id row", visible=False, type="pandas", wrap=True)
is_a_textract_api_call = gr.Checkbox(value=False, label="is_this_a_textract_api_call", visible=False)
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
job_input_textbox = gr.Textbox(value=TEXTRACT_JOBS_S3_INPUT_LOC, label="Textract call outputs", visible=False)
@@ -287,15 +287,16 @@ with app:
send_document_to_textract_api_btn = gr.Button("Analyse document with AWS Textract API call", variant="primary", visible=True)
with gr.Row(equal_height=False):
with gr.Column(scale=2):
-textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(
+textract_job_detail_df = gr.Dataframe(label="Previous job details", visible=True, type="pandas", wrap=True, interactive=True, row_count=(0, 'fixed'), col_count=(5,'fixed'), static_columns=[0,1,2,3,4], max_height=400)
with gr.Column(scale=1):
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
-with gr.Row():
-
-
-
-
+with gr.Row():
+with gr.Column():
+textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
+with gr.Column():
+job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
+convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results", variant="secondary", visible=True)

gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
@@ -534,13 +535,11 @@ with app:
document_redact_btn.click(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
success(fn= enforce_cost_codes, inputs=[enforce_cost_code_textbox, cost_code_choice_drop, cost_code_dataframe_base]).\
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
-
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc")
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children], api_name="redact_doc")
-
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
# If the app has completed a batch of pages, it will rerun the redaction process until the end of all pages in the document
current_loop_page_number.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
-
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
-
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
# If a file has been completed, the function will continue onto the next document
latest_file_completed_text.change(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, text_extract_method_radio, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, second_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
@@ -574,7 +573,8 @@ with app:
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call, textract_query_number]).\
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children],
-
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children])
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_children]).\
+
success(fn=update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, page_min, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs=[annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])

###
# REVIEW PDF REDACTIONS
@@ -640,7 +640,7 @@ with app:
# Review OCR text buttom
-all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row]
+all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row])
reset_all_ocr_results_btn.click(reset_ocr_base_dataframe, inputs=[all_line_level_ocr_results_df_base], outputs=[all_line_level_ocr_results_df])

# Convert review file to xfdf Adobe format
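For context, the job-details table and the OCR results table both rely on Gradio's .select event to push the clicked row back into other components. A minimal, standalone sketch of that wiring (the callback and column names here are placeholders, not the app's actual implementation):

```python
import gradio as gr
import pandas as pd

def row_selected(df: pd.DataFrame, evt: gr.SelectData):
    # For a Dataframe selection, evt.index is (row, column); return the clicked row as a one-row frame
    row = evt.index[0]
    return df.iloc[[row]]

with gr.Blocks() as demo:
    jobs = gr.Dataframe(pd.DataFrame(columns=['job_id', 'file_name', 'job_type', 'signature_extraction', 'job_date_time']), interactive=True)
    selected = gr.Dataframe(label="Selected row")
    jobs.select(row_selected, inputs=[jobs], outputs=[selected])

# demo.launch()
```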
pyproject.toml
CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "doc_redaction"
-version = "0.6.
+version = "0.6.5"
description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
readme = "README.md"
requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
"spacy==3.8.4",
# Direct URL dependency for spacy model
"en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-"gradio==5.
+"gradio==5.29.0",
"boto3==1.38.4",
"pyarrow==19.0.1",
"openpyxl==3.1.5",
requirements.txt
CHANGED
@@ -10,7 +10,7 @@ pandas==2.2.3
scikit-learn==1.6.1
spacy==3.8.4
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.
+gradio==5.29.0
boto3==1.38.4
pyarrow==19.0.1
openpyxl==3.1.5
tools/config.py
CHANGED
@@ -59,6 +59,11 @@ def add_folder_to_path(folder_path: str):
else:
print(f"Folder not found at {folder_path} - not added to PATH")

+
+###
+# LOAD CONFIG FROM ENV FILE
+###
+
ensure_folder_exists("config/")

# If you have an aws_config env file in the config folder, you can load in app variables this way, e.g. 'config/app_config.env'
@@ -70,15 +75,12 @@ if APP_CONFIG_PATH:
load_dotenv(APP_CONFIG_PATH)
else: print("App config file not found at location:", APP_CONFIG_PATH)

-# Report logging to console?
-LOGGING = get_or_create_env_var('LOGGING', 'False')

-
-
-logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+

###
-# AWS
+# AWS OPTIONS
###

# If you have an aws_config env file in the config folder, you can load in AWS keys this way, e.g. 'env/aws_config.env'
@@ -108,27 +110,27 @@ if AWS_SECRET_KEY: print(f'AWS_SECRET_KEY found in environment variables')
DOCUMENT_REDACTION_BUCKET = get_or_create_env_var('DOCUMENT_REDACTION_BUCKET', '')

-
-
# Custom headers e.g. if routing traffic through Cloudfront
# Retrieving or setting CUSTOM_HEADER
CUSTOM_HEADER = get_or_create_env_var('CUSTOM_HEADER', '')
-#if CUSTOM_HEADER: print(f'CUSTOM_HEADER found')

# Retrieving or setting CUSTOM_HEADER_VALUE
CUSTOM_HEADER_VALUE = get_or_create_env_var('CUSTOM_HEADER_VALUE', '')
-
+
+
+

###
-#
+# Image options
###
IMAGES_DPI = get_or_create_env_var('IMAGES_DPI', '300.0')
LOAD_TRUNCATED_IMAGES = get_or_create_env_var('LOAD_TRUNCATED_IMAGES', 'True')
MAX_IMAGE_PIXELS = get_or_create_env_var('MAX_IMAGE_PIXELS', '') # Changed to None if blank in file_conversion.py

###
-# File I/O
+# File I/O options
###
+
SESSION_OUTPUT_FOLDER = get_or_create_env_var('SESSION_OUTPUT_FOLDER', 'False') # i.e. do you want your input and output folders saved within a subfolder based on session hash value within output/input folders

OUTPUT_FOLDER = get_or_create_env_var('GRADIO_OUTPUT_FOLDER', 'output/') # 'output/'
@@ -146,6 +148,11 @@ if OUTPUT_FOLDER == "TEMP" or INPUT_FOLDER == "TEMP":
if OUTPUT_FOLDER == "TEMP": OUTPUT_FOLDER = temp_dir + "/"
if INPUT_FOLDER == "TEMP": INPUT_FOLDER = temp_dir + "/"

+
+###
+# LOGGING OPTIONS
+###
+
# By default, logs are put into a subfolder of today's date and the host name of the instance running the app. This is to avoid at all possible the possibility of log files from one instance overwriting the logs of another instance on S3. If running the app on one system always, or just locally, it is not necessary to make the log folders so specific.
# Another way to address this issue would be to write logs to another type of storage, e.g. database such as dynamodb. I may look into this in future.
@@ -190,8 +197,18 @@ DYNAMODB_FEEDBACK_LOG_HEADERS = get_or_create_env_var('DYNAMODB_FEEDBACK_LOG_HEA
USAGE_LOG_DYNAMODB_TABLE_NAME = get_or_create_env_var('USAGE_LOG_DYNAMODB_TABLE_NAME', 'redaction_usage')
DYNAMODB_USAGE_LOG_HEADERS = get_or_create_env_var('DYNAMODB_USAGE_LOG_HEADERS', '')

+# Report logging to console?
+LOGGING = get_or_create_env_var('LOGGING', 'False')
+
+if LOGGING == 'True':
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+
+
+
###
-# REDACTION
+# REDACTION OPTIONS
###

# Create Tesseract and Poppler folders if you have installed them locally
@@ -207,12 +224,19 @@ PAGE_BREAK_VALUE = get_or_create_env_var('PAGE_BREAK_VALUE', '99999')
MAX_TIME_VALUE = get_or_create_env_var('MAX_TIME_VALUE', '999999')

-CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
+CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "") # only "grey" is currently supported as a custom box colour

REDACTION_LANGUAGE = get_or_create_env_var("REDACTION_LANGUAGE", "en") # Currently only English is supported by the app

+RETURN_PDF_END_OF_REDACTION = get_or_create_env_var("RETURN_PDF_END_OF_REDACTION", "True") # Return a redacted PDF at the end of the redaction task. Could be useful to set this to "False" if you want to ensure that the user always goes to the 'Review Redactions' tab before getting the final redacted PDF product.
+
+COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF","False") # On low memory systems, the compression options in pymupdf can cause the app to crash if the PDF is longer than 500 pages or so. Setting this to False will save the PDF only with a basic cleaning option enabled
+
+
+
+
###
-# APP RUN
+# APP RUN OPTIONS
###

TLDEXTRACT_CACHE = get_or_create_env_var('TLDEXTRACT_CACHE', 'tld/.tld_set_snapshot')
@@ -245,7 +269,12 @@ S3_ALLOW_LIST_PATH = get_or_create_env_var('S3_ALLOW_LIST_PATH', '') # default_a
if ALLOW_LIST_PATH: OUTPUT_ALLOW_LIST_PATH = ALLOW_LIST_PATH
else: OUTPUT_ALLOW_LIST_PATH = 'config/default_allow_list.csv'

-
+
+
+
+###
+# COST CODE OPTIONS
+###

SHOW_COSTS = get_or_create_env_var('SHOW_COSTS', 'False')
@@ -265,7 +294,12 @@ ENFORCE_COST_CODES = get_or_create_env_var('ENFORCE_COST_CODES', 'False') # If y
if ENFORCE_COST_CODES == 'True': GET_COST_CODES = 'True'

-
+
+
+
+###
+# WHOLE DOCUMENT API OPTIONS
+###

SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS = get_or_create_env_var('SHOW_WHOLE_DOCUMENT_TEXTRACT_CALL_OPTIONS', 'False') # This feature not currently implemented
@@ -281,5 +315,6 @@ TEXTRACT_JOBS_S3_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_LOC', 'output') #
TEXTRACT_JOBS_S3_INPUT_LOC = get_or_create_env_var('TEXTRACT_JOBS_S3_INPUT_LOC', 'input') # Subfolder in the DOCUMENT_REDACTION_BUCKET where the Textract jobs are stored

+TEXTRACT_JOBS_LOCAL_LOC = get_or_create_env_var('TEXTRACT_JOBS_LOCAL_LOC', 'output') # Local subfolder where the Textract jobs are stored
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = get_or_create_env_var('DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS', '7') # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
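The new options follow the existing config pattern: every setting is read as a string with get_or_create_env_var and only coerced to bool/int in the module that consumes it (as the file_conversion.py and file_redaction.py changes below do). The helper itself is not part of this diff; a rough sketch of its assumed behaviour:

```python
import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Assumed behaviour: return the existing environment value, otherwise set and return the default.
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

# String flags are coerced where they are used, e.g.:
COMPRESS_REDACTED_PDF = get_or_create_env_var("COMPRESS_REDACTED_PDF", "False")
compress = COMPRESS_REDACTED_PDF.lower() == "true"  # same coercion as in tools/file_conversion.py
```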
tools/file_conversion.py
CHANGED
@@ -27,7 +27,7 @@ IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
pd.set_option('future.no_silent_downcasting', True)

-from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR
+from tools.config import OUTPUT_FOLDER, INPUT_FOLDER, IMAGES_DPI, LOAD_TRUNCATED_IMAGES, MAX_IMAGE_PIXELS, CUSTOM_BOX_COLOUR, COMPRESS_REDACTED_PDF
from tools.helper_functions import get_file_name_without_type, tesseract_ocr_option, text_ocr_option, textract_option, read_file
# from tools.aws_textract import load_and_convert_textract_json
@@ -35,6 +35,7 @@ image_dpi = float(IMAGES_DPI)
if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
ImageFile.LOAD_TRUNCATED_IMAGES = LOAD_TRUNCATED_IMAGES.lower() == "true"
+COMPRESS_REDACTED_PDF = COMPRESS_REDACTED_PDF.lower() == "true"

def is_pdf_or_image(filename):
"""
@@ -841,6 +842,15 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
return out_message, out_file_paths

+def save_pdf_with_or_without_compression(pymupdf_doc:object, out_redacted_pdf_file_path, COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF):
+'''
+Save a pymupdf document with basic cleaning or with full compression options. Can be useful for low memory systems to do minimal cleaning to avoid crashing with large PDFs.
+'''
+if COMPRESS_REDACTED_PDF == True:
+pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
+else:
+pymupdf_doc.save(out_redacted_pdf_file_path, garbage=1, clean=True)
+
def join_values_within_threshold(df1:pd.DataFrame, df2:pd.DataFrame):
# Threshold for matching
threshold = 5
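The new helper centralises the pymupdf save call: garbage=4 plus deflate gives the smallest file but does heavier work on large documents, while garbage=1 with clean does only basic cleanup. A hedged usage sketch of the two modes (paths are placeholders):

```python
import pymupdf  # PyMuPDF

doc = pymupdf.open("example_input.pdf")  # placeholder path; redaction happens elsewhere

# Equivalent to COMPRESS_REDACTED_PDF=True: maximum garbage collection plus stream compression
doc.save("example_redacted_compressed.pdf", garbage=4, deflate=True, clean=True)

# Equivalent to the new default (COMPRESS_REDACTED_PDF=False): basic cleaning only,
# which avoids the memory spike that can crash low-memory systems on very long PDFs
doc.save("example_redacted_basic.pdf", garbage=1, clean=True)
```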
tools/file_redaction.py
CHANGED
@@ -19,9 +19,9 @@ import gradio as gr
from gradio import Progress
from collections import defaultdict # For efficient grouping

-from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
+from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER, RETURN_PDF_END_OF_REDACTION
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes, recreate_page_line_level_ocr_results_with_page
-from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json
+from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids, load_and_convert_ocr_results_with_words_json, save_pdf_with_or_without_compression
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
@@ -31,6 +31,8 @@ if not MAX_IMAGE_PIXELS: Image.MAX_IMAGE_PIXELS = None
else: Image.MAX_IMAGE_PIXELS = MAX_IMAGE_PIXELS
image_dpi = float(IMAGES_DPI)

+RETURN_PDF_END_OF_REDACTION = RETURN_PDF_END_OF_REDACTION.lower() == "true"
+
def bounding_boxes_overlap(box1, box2):
"""Check if two bounding boxes overlap."""
return (box1[0] < box2[2] and box2[0] < box1[2] and
@@ -104,6 +106,7 @@ def choose_and_run_redactor(file_paths:List[str],
all_page_line_level_ocr_results = [],
all_page_line_level_ocr_results_with_words = [],
prepare_images:bool=True,
+RETURN_PDF_END_OF_REDACTION:bool=RETURN_PDF_END_OF_REDACTION,
progress=gr.Progress(track_tqdm=True)):
'''
This function orchestrates the redaction process based on the specified method and parameters. It takes the following inputs:
@@ -155,6 +158,7 @@ def choose_and_run_redactor(file_paths:List[str],
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
- prepare_images (bool, optional): Boolean to determine whether to load images for the PDF.
+- RETURN_PDF_END_OF_REDACTION (bool, optional): Boolean to determine whether to return a redacted PDF at the end of the redaction process.
- progress (gr.Progress, optional): A progress tracker for the redaction process. Defaults to a Progress object with track_tqdm set to True.

The function returns a redacted document along with processing logs.
@@ -232,6 +236,11 @@ def choose_and_run_redactor(file_paths:List[str],
combined_out_message = re.sub(r'^\n+', '', combined_out_message).strip()

+end_message = "\n\nPlease review and modify the suggested redaction outputs on the 'Review redactions' tab of the app (you can find this under the introduction text at the top of the page)."
+
+if not end_message in combined_out_message:
+combined_out_message = combined_out_message + end_message
+
# Only send across review file if redaction has been done
if pii_identification_method != no_redaction_option:
@@ -529,26 +538,27 @@ def choose_and_run_redactor(file_paths:List[str],
if latest_file_completed != len(file_paths_list):
print("Completed file number:", str(latest_file_completed), "there are more files to do")

-
+

# Save redacted file
if pii_identification_method != no_redaction_option:
-if
-
-
-if
-
-
+if RETURN_PDF_END_OF_REDACTION == True:
+progress(0.9, "Saving redacted file")
+
+if is_pdf(file_path) == False:
+out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.png"
+# pymupdf_doc is an image list in this case
+if isinstance(pymupdf_doc[-1], str):
+img = Image.open(pymupdf_doc[-1])
+# Otherwise could be an image object
+else:
+img = pymupdf_doc[-1]
+img.save(out_redacted_pdf_file_path, "PNG" ,resolution=image_dpi)
else:
-
-
-
-
-out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
-print("Saving redacted PDF file:", out_redacted_pdf_file_path)
-pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
-
-out_file_paths.append(out_redacted_pdf_file_path)
+out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
+print("Saving redacted PDF file:", out_redacted_pdf_file_path)
+save_pdf_with_or_without_compression(pymupdf_doc, out_redacted_pdf_file_path)
+out_file_paths.append(out_redacted_pdf_file_path)

if not all_line_level_ocr_results_df.empty:
all_line_level_ocr_results_df = all_line_level_ocr_results_df[["page", "text", "left", "top", "width", "height"]]
@@ -564,11 +574,10 @@ def choose_and_run_redactor(file_paths:List[str],
# Convert the gradio annotation boxes to relative coordinates
# Convert annotations_all_pages to a consistent relative coordinate format output
+progress(0.93, "Creating review file output")
page_sizes = page_sizes_df.to_dict(orient="records")
all_image_annotations_df = convert_annotation_data_to_dataframe(annotations_all_pages)
-
all_image_annotations_df = divide_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
-
annotations_all_pages_divide = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
annotations_all_pages_divide = remove_duplicate_images_with_blank_boxes(annotations_all_pages_divide)
@@ -934,7 +943,7 @@ def set_cropbox_safely(page, original_cropbox):
"""
mediabox = page.mediabox
if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
-print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
+#print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
page.set_cropbox(mediabox)
else:
page.set_cropbox(original_cropbox)
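The save step in choose_and_run_redactor is now gated on RETURN_PDF_END_OF_REDACTION, and the output switches between a PNG (for image inputs) and a PDF. A simplified, hedged sketch of that decision as a standalone function with placeholder names, outside the real app:

```python
from PIL import Image
import pymupdf

def save_redacted_output(pymupdf_doc, is_pdf_input: bool, out_path_base: str,
                         return_pdf: bool = True, image_dpi: float = 300.0):
    """Illustrative only: mirrors the branch added in choose_and_run_redactor."""
    if not return_pdf:
        return None  # RETURN_PDF_END_OF_REDACTION=False: skip writing a final file
    if not is_pdf_input:
        # For image inputs the 'document' is a list of images (file paths or PIL images)
        last = pymupdf_doc[-1]
        img = Image.open(last) if isinstance(last, str) else last
        out_path = out_path_base + "_redacted.png"
        img.save(out_path, "PNG", resolution=image_dpi)
    else:
        out_path = out_path_base + "_redacted.pdf"
        pymupdf_doc.save(out_path, garbage=1, clean=True)  # basic cleaning; full compression is optional
    return out_path
```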
tools/redaction_review.py
CHANGED
@@ -13,8 +13,8 @@ from pymupdf import Document, Rect
import pymupdf
from PIL import ImageDraw, Image

-from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
+from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER, COMPRESS_REDACTED_PDF
-from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes
+from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes, save_pdf_with_or_without_compression
from tools.helper_functions import get_file_name_without_type, detect_file_type
from tools.file_redaction import redact_page_with_pymupdf
@@ -731,9 +731,10 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
output_folder:str = OUTPUT_FOLDER,
save_pdf:bool=True,
page_sizes:List[dict]=[],
+COMPRESS_REDACTED_PDF:bool=COMPRESS_REDACTED_PDF,
progress=gr.Progress(track_tqdm=True)):
'''
-Apply modified redactions to a pymupdf and export review files
+Apply modified redactions to a pymupdf and export review files.
'''

output_files = []
@@ -849,7 +850,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
#try:
if pdf_doc:
out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
-pdf_doc
+save_pdf_with_or_without_compression(pdf_doc, out_pdf_file_path, COMPRESS_REDACTED_PDF)
output_files.append(out_pdf_file_path)

else:
tools/textract_batch_call.py
CHANGED
@@ -11,11 +11,13 @@ from typing import List
from io import StringIO
from urllib.parse import urlparse
from botocore.exceptions import ClientError, NoCredentialsError, PartialCredentialsError, TokenRetrievalError
-from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER
+from tools.config import TEXTRACT_WHOLE_DOCUMENT_ANALYSIS_BUCKET, OUTPUT_FOLDER, AWS_REGION, DOCUMENT_REDACTION_BUCKET, LOAD_PREVIOUS_TEXTRACT_JOBS_S3, TEXTRACT_JOBS_S3_LOC, TEXTRACT_JOBS_LOCAL_LOC, TEXTRACT_JOBS_S3_INPUT_LOC, RUN_AWS_FUNCTIONS, INPUT_FOLDER, DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS
from tools.aws_functions import download_file_from_s3
from tools.file_conversion import get_input_file_names
from tools.helper_functions import get_file_name_without_type

+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
+
def analyse_document_with_textract_api(
local_pdf_path: str,
s3_input_prefix: str,
@@ -164,7 +166,7 @@ def analyse_document_with_textract_api(
'file_name': pdf_filename,
'job_type': job_type,
'signature_extraction':analyse_signatures,
-'s3_location': job_location_full,
+#'s3_location': job_location_full,
'job_date_time': datetime.datetime.now()
}])
@@ -402,6 +404,7 @@ def poll_whole_document_textract_analysis_progress_and_download(
load_jobs_from_s3:str = LOAD_PREVIOUS_TEXTRACT_JOBS_S3,
poll_interval_seconds: int = 1,
max_polling_attempts: int = 1, # ~10 minutes total wait time):
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS: int = DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS,
progress = gr.Progress(track_tqdm=True)
):
'''
@@ -446,19 +449,19 @@ def poll_whole_document_textract_analysis_progress_and_download(
else:
error = f"Unknown job type, cannot poll job"
print(error)
-
-raise
+logging.error(f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed.")
+raise Exception(error_message)

except textract_client.exceptions.InvalidJobIdException:
-error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than
+error_message = f"Invalid JobId: {job_id}. This might happen if the job expired (older than {DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS} days) or never existed."
print(error_message)
logging.error(error_message)
-raise
+raise Exception(error_message)
except Exception as e:
error_message = f"Error while polling Textract status for job {job_id}: {e}"
print(error_message)
logging.error(error_message)
-raise
+raise Exception(error_message)

downloaded_file_path = None
if job_status == 'SUCCEEDED':
@@ -514,11 +517,12 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
load_s3_jobs_loc:str=TEXTRACT_JOBS_S3_LOC,
load_local_jobs_loc:str=TEXTRACT_JOBS_LOCAL_LOC,
document_redaction_bucket:str=DOCUMENT_REDACTION_BUCKET,
-aws_region:str=AWS_REGION
+aws_region:str=AWS_REGION,
+DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS:int=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS):
'''
Load in a dataframe of jobs previous submitted to the Textract API service.
'''
-job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','
+job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','job_date_time'])

# Initialize boto3 clients
session = boto3.Session(region_name=aws_region)
@@ -550,9 +554,14 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
if "job_date_time" in job_df.columns:
job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors='coerce')
-# Keep only jobs that have been completed in the last
-cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=
-job_df = job_df.loc[job_df["job_date_time"]
+# Keep only jobs that have been completed in the last 'DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS' days
+cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
+job_df = job_df.loc[job_df["job_date_time"] > cutoff_time,:]
+
+try:
+job_df = job_df[['job_id','file_name','job_type','signature_extraction','job_date_time']]
+except Exception as e:
+print("Could not find one or more columns in Textract whole document list dataframe:", e)

return job_df
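The job list is now trimmed client-side rather than deleted: anything older than DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS is filtered out of the dataframe but left in the underlying CSV. A small self-contained sketch of that filter (the sample rows are made up):

```python
import pandas as pd

DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = 7  # default from tools/config.py

job_df = pd.DataFrame({
    "job_id": ["a1", "b2"],
    "file_name": ["old_doc.pdf", "recent_doc.pdf"],
    "job_type": ["document_text_detection"] * 2,
    "signature_extraction": [False, False],
    "job_date_time": [pd.Timestamp.now() - pd.Timedelta(days=30), pd.Timestamp.now()],
})

job_df["job_date_time"] = pd.to_datetime(job_df["job_date_time"], errors="coerce")
cutoff_time = pd.Timestamp.now() - pd.Timedelta(days=DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS)
job_df = job_df.loc[job_df["job_date_time"] > cutoff_time, :]
print(job_df)  # only 'recent_doc.pdf' remains
```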