Commit
·
46bf91e
1
Parent(s):
52c1a90
Added button to convert Textract API outputs to ocr_output files easily. Corrected Textract job file location
Browse files- app.py +14 -3
- tools/textract_batch_call.py +8 -4
app.py
CHANGED
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
|
|
15 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
16 |
from tools.custom_csvlogger import CSVLogger_custom
|
17 |
from tools.find_duplicate_pages import identify_similar_pages
|
18 |
-
from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
|
19 |
|
20 |
# Suppress downcasting warnings
|
21 |
pd.set_option('future.no_silent_downcasting', True)
|
@@ -153,6 +153,8 @@ with app:
|
|
153 |
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
154 |
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
|
155 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
|
|
|
|
156 |
|
157 |
load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
|
158 |
s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
|
@@ -263,8 +265,10 @@ with app:
|
|
263 |
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
|
264 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
265 |
with gr.Row():
|
266 |
-
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
267 |
-
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
|
|
|
|
268 |
|
269 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
270 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
@@ -522,6 +526,13 @@ with app:
|
|
522 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
523 |
|
524 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
525 |
|
526 |
###
|
527 |
# REVIEW PDF REDACTIONS
|
|
|
15 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
16 |
from tools.custom_csvlogger import CSVLogger_custom
|
17 |
from tools.find_duplicate_pages import identify_similar_pages
|
18 |
+
from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist
|
19 |
|
20 |
# Suppress downcasting warnings
|
21 |
pd.set_option('future.no_silent_downcasting', True)
|
|
|
153 |
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
154 |
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
|
155 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
156 |
+
no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
|
157 |
+
textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
|
158 |
|
159 |
load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
|
160 |
s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
|
|
|
265 |
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
|
266 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
267 |
with gr.Row():
|
268 |
+
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
269 |
+
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
270 |
+
|
271 |
+
convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=True)
|
272 |
|
273 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
274 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
|
|
526 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
527 |
|
528 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
529 |
+
|
530 |
+
|
531 |
+
convert_textract_outputs_to_ocr_results.click(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
532 |
+
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
533 |
+
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
|
534 |
+
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
535 |
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path])
|
536 |
|
537 |
###
|
538 |
# REVIEW PDF REDACTIONS
|
tools/textract_batch_call.py
CHANGED
@@ -164,7 +164,7 @@ def analyse_document_with_textract_api(
|
|
164 |
}])
|
165 |
|
166 |
# File path
|
167 |
-
log_file_path = os.path.join(local_output_dir, "
|
168 |
|
169 |
# Check if file exists
|
170 |
file_exists = os.path.exists(log_file_path)
|
@@ -454,8 +454,6 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
|
|
454 |
|
455 |
if load_s3_jobs == 'True':
|
456 |
s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
|
457 |
-
|
458 |
-
print("s3_output_key:", s3_output_key)
|
459 |
|
460 |
try:
|
461 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
@@ -523,4 +521,10 @@ def download_textract_output(job_id:str,
|
|
523 |
s3_client.download_file(output_bucket, output_file_key, local_file_path)
|
524 |
print(f"Output file downloaded to: {local_file_path}")
|
525 |
except Exception as e:
|
526 |
-
print(f"Error downloading file: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
}])
|
165 |
|
166 |
# File path
|
167 |
+
log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
|
168 |
|
169 |
# Check if file exists
|
170 |
file_exists = os.path.exists(log_file_path)
|
|
|
454 |
|
455 |
if load_s3_jobs == 'True':
|
456 |
s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
|
|
|
|
|
457 |
|
458 |
try:
|
459 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
|
|
521 |
s3_client.download_file(output_bucket, output_file_key, local_file_path)
|
522 |
print(f"Output file downloaded to: {local_file_path}")
|
523 |
except Exception as e:
|
524 |
+
print(f"Error downloading file: {e}")
|
525 |
+
|
526 |
+
def check_textract_outputs_exist(textract_output_found_checkbox: bool) -> None:
    """Guard step that verifies Textract outputs were found before continuing.

    Intended to sit in a chain of UI event steps: it returns normally when
    the flag says outputs exist and raises otherwise, so a following step
    that only runs after a successful predecessor is not reached.

    Args:
        textract_output_found_checkbox: Flag indicating whether Textract
            output for the current document was found.

    Returns:
        None. The function is called only for its pass/raise effect.

    Raises:
        Exception: If the flag is not equal to ``True``.
    """
    # Explicit comparison with True is kept on purpose: only a real
    # "found" flag passes, while None or arbitrary truthy values (e.g. a
    # non-empty string) are rejected exactly as before.
    if textract_output_found_checkbox == True:  # noqa: E712
        print("Textract outputs found")
        return

    raise Exception(
        "Relevant Textract outputs not found. Please ensure you have selected "
        "the correct results output and you have uploaded the relevant "
        "document file in 'Choose document or image file...' above"
    )
|