seanpedrickcase committed on
Commit
46bf91e
·
1 Parent(s): 52c1a90

Added button to convert Textract API outputs to ocr_output files easily. Corrected Textract job file location

Browse files
Files changed (2) hide show
  1. app.py +14 -3
  2. tools/textract_batch_call.py +8 -4
app.py CHANGED
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
- from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
@@ -153,6 +153,8 @@ with app:
153
  s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
154
  s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
155
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
 
 
156
 
157
  load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
158
  s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
@@ -263,8 +265,10 @@ with app:
263
  job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
264
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
265
  with gr.Row():
266
- job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
267
- textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
 
 
268
 
269
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
270
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
@@ -522,6 +526,13 @@ with app:
522
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
523
 
524
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
 
 
 
 
 
 
 
525
 
526
  ###
527
  # REVIEW PDF REDACTIONS
 
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
 
153
  s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
154
  s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
155
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
156
+ no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
157
+ textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
158
 
159
  load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
160
  s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
 
265
  job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
266
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
267
  with gr.Row():
268
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
269
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
270
+
271
+ convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=True)
272
 
273
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
274
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
 
526
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
527
 
528
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
529
+
530
+
531
+ convert_textract_outputs_to_ocr_results.click(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
532
+ success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
533
+ success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
534
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
535
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path])
536
 
537
  ###
538
  # REVIEW PDF REDACTIONS
tools/textract_batch_call.py CHANGED
@@ -164,7 +164,7 @@ def analyse_document_with_textract_api(
164
  }])
165
 
166
  # File path
167
- log_file_path = os.path.join(local_output_dir, "textract_job_log_files.csv")
168
 
169
  # Check if file exists
170
  file_exists = os.path.exists(log_file_path)
@@ -454,8 +454,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
454
 
455
  if load_s3_jobs == 'True':
456
  s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
457
-
458
- print("s3_output_key:", s3_output_key)
459
 
460
  try:
461
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
@@ -523,4 +521,10 @@ def download_textract_output(job_id:str,
523
  s3_client.download_file(output_bucket, output_file_key, local_file_path)
524
  print(f"Output file downloaded to: {local_file_path}")
525
  except Exception as e:
526
- print(f"Error downloading file: {e}")
 
 
 
 
 
 
 
164
  }])
165
 
166
  # File path
167
+ log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
168
 
169
  # Check if file exists
170
  file_exists = os.path.exists(log_file_path)
 
454
 
455
  if load_s3_jobs == 'True':
456
  s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
 
 
457
 
458
  try:
459
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
 
521
  s3_client.download_file(output_bucket, output_file_key, local_file_path)
522
  print(f"Output file downloaded to: {local_file_path}")
523
  except Exception as e:
524
+ print(f"Error downloading file: {e}")
525
+
526
def check_textract_outputs_exist(textract_output_found_checkbox):
    """Verify that Textract outputs were found before later steps run.

    Acts as a gate: it returns quietly when outputs were found and raises
    otherwise, so that a caller chaining further work after this check does
    not proceed without the outputs.

    Args:
        textract_output_found_checkbox: Flag indicating whether Textract
            outputs were found. Only a value that compares equal to ``True``
            (e.g. ``True`` or ``1``) counts as found.

    Returns:
        None when the outputs were found.

    Raises:
        Exception: If the flag does not compare equal to ``True``. The
            message tells the user what to check.
    """
    # The equality comparison (rather than plain truthiness) is kept on
    # purpose so that values such as non-empty strings are still rejected.
    if textract_output_found_checkbox == True:  # noqa: E712
        print("Textract outputs found")
        return

    raise Exception(
        "Relevant Textract outputs not found. Please ensure you have selected "
        "the correct results output and you have uploaded the relevant "
        "document file in 'Choose document or image file...' above"
    )