Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Apr 22

Commit

46bf91e

1 Parent(s): 52c1a90

Added button to convert Textract API outputs to ocr_output files easily. Corrected Textract job file location

Browse files

Files changed (2) hide show

app.py +14 -3
tools/textract_batch_call.py +8 -4

app.py CHANGED Viewed

@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.find_duplicate_pages import identify_similar_pages
-from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
@@ -153,6 +153,8 @@ with app:
     s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
     s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
     successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
     load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
     s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
@@ -263,8 +265,10 @@ with app:
                             job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
                             check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
                     with gr.Row():
-                        job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
-                        textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
             gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
             document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
@@ -522,6 +526,13 @@ with app:
     success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
     textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
     ###
     # REVIEW PDF REDACTIONS

 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.custom_csvlogger import CSVLogger_custom
 from tools.find_duplicate_pages import identify_similar_pages
+from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist
 # Suppress downcasting warnings
 pd.set_option('future.no_silent_downcasting', True)
     s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
     s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
     successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
+    no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
+    textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
     load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
     s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
                             job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
                             check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
                     with gr.Row():
+                        job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
+                        textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
+                    convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=True)
             gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
             document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
     success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
     textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
+    convert_textract_outputs_to_ocr_results.click(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
+        success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
+        success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
+        success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
+                    outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path])
     ###
     # REVIEW PDF REDACTIONS

tools/textract_batch_call.py CHANGED Viewed

@@ -164,7 +164,7 @@ def analyse_document_with_textract_api(
         }])
         # File path
-        log_file_path = os.path.join(local_output_dir, "textract_job_log_files.csv")
         # Check if file exists
         file_exists = os.path.exists(log_file_path)
@@ -454,8 +454,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
     if load_s3_jobs == 'True':
         s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
-        print("s3_output_key:", s3_output_key)
         try:
             s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
@@ -523,4 +521,10 @@ def download_textract_output(job_id:str,
         s3_client.download_file(output_bucket, output_file_key, local_file_path)
         print(f"Output file downloaded to: {local_file_path}")
     except Exception as e:
-        print(f"Error downloading file: {e}")

         }])
         # File path
+        log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
         # Check if file exists
         file_exists = os.path.exists(log_file_path)
     if load_s3_jobs == 'True':
         s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
         try:
             s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
         s3_client.download_file(output_bucket, output_file_key, local_file_path)
         print(f"Output file downloaded to: {local_file_path}")
     except Exception as e:
+        print(f"Error downloading file: {e}")
def check_textract_outputs_exist(textract_output_found_checkbox):
    """Confirm that Textract outputs were found, or raise.

    Intended as a gate step in an event chain: it returns normally when the
    flag is set and raises otherwise, so that follow-on steps which need the
    Textract outputs are not run without them.

    Args:
        textract_output_found_checkbox: Flag indicating whether Textract
            outputs were found for the current document. Presumably the
            boolean value of a checkbox component - verify against caller.

    Returns:
        None. A confirmation message is printed when the outputs were found.

    Raises:
        Exception: If the flag does not compare equal to ``True``.
    """
    # Deliberately a strict comparison with True rather than a truthiness
    # test, so that truthy non-boolean values (e.g. a non-empty string) are
    # still rejected.
    if textract_output_found_checkbox != True:  # noqa: E712
        raise Exception(
            "Relevant Textract outputs not found. Please ensure you have "
            "selected the correct results output and you have uploaded the "
            "relevant document file in 'Choose document or image file...' above"
        )

    print("Textract outputs found")