Merge pull request #16 from seanpedrick-case/dev
Browse filesAdded id and text properties to annotation object. Other minor changes.
- app.py +34 -17
- requirements.txt +1 -2
- tools/auth.py +3 -1
- tools/aws_functions.py +0 -4
- tools/file_conversion.py +375 -183
- tools/file_redaction.py +35 -13
- tools/redaction_review.py +112 -34
- tools/textract_batch_call.py +10 -6
app.py
CHANGED
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
|
|
15 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
16 |
from tools.custom_csvlogger import CSVLogger_custom
|
17 |
from tools.find_duplicate_pages import identify_similar_pages
|
18 |
-
from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
|
19 |
|
20 |
# Suppress downcasting warnings
|
21 |
pd.set_option('future.no_silent_downcasting', True)
|
@@ -153,6 +153,8 @@ with app:
|
|
153 |
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
154 |
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
|
155 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
|
|
|
|
156 |
|
157 |
load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
|
158 |
s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
|
@@ -164,7 +166,7 @@ with app:
|
|
164 |
default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
|
165 |
|
166 |
# Base tables that are not modified subsequent to load
|
167 |
-
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=
|
168 |
all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
|
169 |
all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
|
170 |
cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
|
@@ -203,6 +205,7 @@ with app:
|
|
203 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
204 |
|
205 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
|
|
206 |
|
207 |
###
|
208 |
# UI DESIGN
|
@@ -263,8 +266,10 @@ with app:
|
|
263 |
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
|
264 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
265 |
with gr.Row():
|
266 |
-
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
267 |
-
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
|
|
|
|
268 |
|
269 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
270 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
@@ -298,8 +303,8 @@ with app:
|
|
298 |
with gr.Column(scale=2):
|
299 |
with gr.Row(equal_height=True):
|
300 |
annotation_last_page_button = gr.Button("Previous page", scale = 4)
|
301 |
-
annotate_current_page = gr.Number(value=
|
302 |
-
annotate_max_pages = gr.Number(value=
|
303 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
304 |
|
305 |
zoom_str = str(annotator_zoom_number) + '%'
|
@@ -336,7 +341,7 @@ with app:
|
|
336 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
337 |
page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
|
338 |
text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
|
339 |
-
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(
|
340 |
|
341 |
with gr.Row(equal_height=True):
|
342 |
exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
|
@@ -346,7 +351,9 @@ with app:
|
|
346 |
|
347 |
undo_last_removal_btn = gr.Button(value="Undo last element removal")
|
348 |
|
349 |
-
selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=
|
|
|
|
|
350 |
|
351 |
with gr.Accordion("Search all extracted text", open=True):
|
352 |
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
|
@@ -520,6 +527,13 @@ with app:
|
|
520 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
521 |
|
522 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
523 |
|
524 |
###
|
525 |
# REVIEW PDF REDACTIONS
|
@@ -546,17 +560,22 @@ with app:
|
|
546 |
|
547 |
# Apply page redactions
|
548 |
annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
|
|
|
|
|
|
|
|
|
|
|
549 |
|
550 |
# Review table controls
|
551 |
recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
|
552 |
page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
|
553 |
text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
|
554 |
|
555 |
-
|
556 |
-
|
557 |
-
|
558 |
-
|
559 |
-
|
560 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
561 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
562 |
|
@@ -577,9 +596,7 @@ with app:
|
|
577 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
578 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
579 |
|
580 |
-
|
581 |
-
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
582 |
-
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
583 |
|
584 |
# Review OCR text buttom
|
585 |
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
|
@@ -717,7 +734,7 @@ if __name__ == "__main__":
|
|
717 |
if RUN_DIRECT_MODE == "0":
|
718 |
|
719 |
if os.environ['COGNITO_AUTH'] == "1":
|
720 |
-
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
721 |
else:
|
722 |
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
723 |
|
|
|
15 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
16 |
from tools.custom_csvlogger import CSVLogger_custom
|
17 |
from tools.find_duplicate_pages import identify_similar_pages
|
18 |
+
from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist
|
19 |
|
20 |
# Suppress downcasting warnings
|
21 |
pd.set_option('future.no_silent_downcasting', True)
|
|
|
153 |
s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
|
154 |
s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
|
155 |
successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
|
156 |
+
no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
|
157 |
+
textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
|
158 |
|
159 |
load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
|
160 |
s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
|
|
|
166 |
default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
|
167 |
|
168 |
# Base tables that are not modified subsequent to load
|
169 |
+
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
|
170 |
all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
|
171 |
all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
|
172 |
cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
|
|
|
205 |
job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
|
206 |
|
207 |
textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
|
208 |
+
convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
|
209 |
|
210 |
###
|
211 |
# UI DESIGN
|
|
|
266 |
job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
|
267 |
check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
|
268 |
with gr.Row():
|
269 |
+
job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
|
270 |
+
textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
|
271 |
+
|
272 |
+
convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=True)
|
273 |
|
274 |
gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
|
275 |
document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
|
|
|
303 |
with gr.Column(scale=2):
|
304 |
with gr.Row(equal_height=True):
|
305 |
annotation_last_page_button = gr.Button("Previous page", scale = 4)
|
306 |
+
annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
|
307 |
+
annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
|
308 |
annotation_next_page_button = gr.Button("Next page", scale = 4)
|
309 |
|
310 |
zoom_str = str(annotator_zoom_number) + '%'
|
|
|
341 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
342 |
page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
|
343 |
text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
|
344 |
+
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=(4,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
|
345 |
|
346 |
with gr.Row(equal_height=True):
|
347 |
exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
|
|
|
351 |
|
352 |
undo_last_removal_btn = gr.Button(value="Undo last element removal")
|
353 |
|
354 |
+
selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True)
|
355 |
+
selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)
|
356 |
+
selected_entity_colour = gr.Textbox(value="", label="selected_entity_colour", visible=False)
|
357 |
|
358 |
with gr.Accordion("Search all extracted text", open=True):
|
359 |
all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
|
|
|
527 |
success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
|
528 |
|
529 |
textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
|
530 |
+
|
531 |
+
|
532 |
+
convert_textract_outputs_to_ocr_results.click(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
|
533 |
+
success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
|
534 |
+
success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
|
535 |
+
success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
|
536 |
+
outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path])
|
537 |
|
538 |
###
|
539 |
# REVIEW PDF REDACTIONS
|
|
|
560 |
|
561 |
# Apply page redactions
|
562 |
annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
|
563 |
+
|
564 |
+
# Save current page redactions
|
565 |
+
update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
|
566 |
+
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
567 |
+
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
568 |
|
569 |
# Review table controls
|
570 |
recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
|
571 |
page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
|
572 |
text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
|
573 |
|
574 |
+
# Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
|
575 |
+
recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page, selected_entity_dataframe_row]).\
|
576 |
+
success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour, page_sizes], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
|
577 |
+
success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, annotate_current_page, annotate_previous_page, all_image_annotations_state, annotator], outputs=[annotator, all_image_annotations_state])
|
578 |
+
|
579 |
reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
|
580 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
|
581 |
|
|
|
596 |
success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
|
597 |
success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
|
598 |
|
599 |
+
|
|
|
|
|
600 |
|
601 |
# Review OCR text buttom
|
602 |
all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
|
|
|
734 |
if RUN_DIRECT_MODE == "0":
|
735 |
|
736 |
if os.environ['COGNITO_AUTH'] == "1":
|
737 |
+
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
738 |
else:
|
739 |
app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
|
740 |
|
requirements.txt
CHANGED
@@ -10,7 +10,6 @@ pandas==2.2.3
|
|
10 |
scikit-learn==1.6.1
|
11 |
spacy==3.8.4
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
13 |
-
#en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
14 |
gradio==5.25.2
|
15 |
boto3==1.37.29
|
16 |
pyarrow==19.0.1
|
@@ -19,7 +18,7 @@ Faker==36.1.1
|
|
19 |
python-levenshtein==0.26.1
|
20 |
spaczz==0.6.1
|
21 |
# The following version
|
22 |
-
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.
|
23 |
rapidfuzz==3.12.1
|
24 |
python-dotenv==1.0.1
|
25 |
numpy==1.26.4
|
|
|
10 |
scikit-learn==1.6.1
|
11 |
spacy==3.8.4
|
12 |
en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
|
|
|
13 |
gradio==5.25.2
|
14 |
boto3==1.37.29
|
15 |
pyarrow==19.0.1
|
|
|
18 |
python-levenshtein==0.26.1
|
19 |
spaczz==0.6.1
|
20 |
# The following version
|
21 |
+
https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.2/gradio_image_annotation-0.3.2-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
|
22 |
rapidfuzz==3.12.1
|
23 |
python-dotenv==1.0.1
|
24 |
numpy==1.26.4
|
tools/auth.py
CHANGED
@@ -69,5 +69,7 @@ def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL
|
|
69 |
except client.exceptions.UserNotFoundException:
|
70 |
return False
|
71 |
except Exception as e:
|
72 |
-
|
|
|
|
|
73 |
return False
|
|
|
69 |
except client.exceptions.UserNotFoundException:
|
70 |
return False
|
71 |
except Exception as e:
|
72 |
+
out_message = f"An error occurred: {e}"
|
73 |
+
print(out_message)
|
74 |
+
raise Exception(out_message)
|
75 |
return False
|
tools/aws_functions.py
CHANGED
@@ -42,10 +42,6 @@ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str
|
|
42 |
if RUN_AWS_FUNCTIONS == "1":
|
43 |
|
44 |
try:
|
45 |
-
print("bucket_name:", bucket_name)
|
46 |
-
print("key:", key)
|
47 |
-
print("local_file_path_and_name:", local_file_path_and_name)
|
48 |
-
|
49 |
# Ensure the local directory exists
|
50 |
os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
|
51 |
|
|
|
42 |
if RUN_AWS_FUNCTIONS == "1":
|
43 |
|
44 |
try:
|
|
|
|
|
|
|
|
|
45 |
# Ensure the local directory exists
|
46 |
os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
|
47 |
|
tools/file_conversion.py
CHANGED
@@ -19,6 +19,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
19 |
from pdf2image import convert_from_path
|
20 |
from PIL import Image
|
21 |
from scipy.spatial import cKDTree
|
|
|
|
|
22 |
|
23 |
IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
|
24 |
|
@@ -834,10 +836,10 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
|
|
834 |
# Filter items with non-empty boxes
|
835 |
non_empty_boxes = [item for item in items if item.get('boxes')]
|
836 |
|
837 |
-
# Remove 'text' elements from boxes
|
838 |
-
for item in non_empty_boxes:
|
839 |
-
|
840 |
-
|
841 |
|
842 |
if non_empty_boxes:
|
843 |
# Keep the first entry with non-empty boxes
|
@@ -855,13 +857,19 @@ def divide_coordinates_by_page_sizes(review_file_df:pd.DataFrame, page_sizes_df:
|
|
855 |
review_file_df_out = review_file_df
|
856 |
|
857 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
|
|
|
|
|
|
|
|
858 |
review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]
|
|
|
|
|
859 |
|
860 |
-
|
861 |
|
862 |
-
|
863 |
|
864 |
-
review_file_df_div =
|
865 |
|
866 |
if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
|
867 |
|
@@ -902,6 +910,11 @@ def multiply_coordinates_by_page_sizes(review_file_df: pd.DataFrame, page_sizes_
|
|
902 |
|
903 |
|
904 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
|
|
|
|
|
|
|
|
|
|
905 |
# Separate absolute vs relative coordinates
|
906 |
review_file_df_orig = review_file_df.loc[
|
907 |
(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
|
@@ -1014,6 +1027,12 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
|
|
1014 |
if not 'text' in df2.columns: df2['text'] = ''
|
1015 |
if not 'text' in df1.columns: df1['text'] = ''
|
1016 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1017 |
# Create a unique key based on coordinates and label for exact merge
|
1018 |
merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
|
1019 |
df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
|
@@ -1031,6 +1050,8 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
|
|
1031 |
|
1032 |
# Handle missing matches using a proximity-based approach
|
1033 |
# Convert coordinates to numpy arrays for KDTree lookup
|
|
|
|
|
1034 |
query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)
|
1035 |
|
1036 |
# Check for NaN or infinite values in query_coords and filter them out
|
@@ -1064,9 +1085,6 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
|
|
1064 |
|
1065 |
return merged_df
|
1066 |
|
1067 |
-
|
1068 |
-
|
1069 |
-
|
1070 |
def _extract_page_number(image_path: Any) -> int:
|
1071 |
"""Helper function to safely extract page number."""
|
1072 |
if not isinstance(image_path, str):
|
@@ -1085,7 +1103,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1085 |
'''
|
1086 |
if not all_annotations:
|
1087 |
# Return an empty DataFrame with the expected schema if input is empty
|
1088 |
-
return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text"])
|
1089 |
|
1090 |
# 1. Create initial DataFrame from the list of annotations
|
1091 |
# Use list comprehensions with .get() for robustness
|
@@ -1102,7 +1120,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1102 |
# Explode removes rows where the list is empty. We want to keep them
|
1103 |
# as rows with NA values. Replace empty lists with a list containing
|
1104 |
# a single placeholder dictionary.
|
1105 |
-
placeholder_box = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA}
|
1106 |
df['boxes'] = df['boxes'].apply(lambda x: x if x else [placeholder_box])
|
1107 |
|
1108 |
# 4. Explode the 'boxes' column. Each item in the list becomes a new row.
|
@@ -1124,7 +1142,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1124 |
# prevents this from being necessary.
|
1125 |
|
1126 |
# 7. Ensure essential columns exist and set column order
|
1127 |
-
essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text"]
|
1128 |
for col in essential_box_cols:
|
1129 |
if col not in final_df.columns:
|
1130 |
final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
|
@@ -1140,71 +1158,6 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
|
|
1140 |
|
1141 |
return final_df
|
1142 |
|
1143 |
-
|
1144 |
-
# def convert_annotation_data_to_dataframe(all_annotations:List[dict]):
|
1145 |
-
# '''
|
1146 |
-
# Convert an annotation list of dictionaries to a dataframe with all boxes on a separate row
|
1147 |
-
# '''
|
1148 |
-
# # Flatten the data
|
1149 |
-
# flattened_annotation_data = []
|
1150 |
-
|
1151 |
-
# for annotation in all_annotations:
|
1152 |
-
# image_path = annotation["image"]
|
1153 |
-
|
1154 |
-
# if image_path:
|
1155 |
-
# match = re.search(r'_(\d+)\.png$', image_path)
|
1156 |
-
# if match:
|
1157 |
-
# number = match.group(1)
|
1158 |
-
# reported_number = int(number) + 1
|
1159 |
-
# else:
|
1160 |
-
# reported_number = 1
|
1161 |
-
# else:
|
1162 |
-
# reported_number = 1
|
1163 |
-
|
1164 |
-
# # Check if 'boxes' is in the annotation, if not, add an empty list
|
1165 |
-
# if 'boxes' not in annotation:
|
1166 |
-
# annotation['boxes'] = []
|
1167 |
-
|
1168 |
-
# # If boxes are empty, create a row with blank values for xmin, xmax, ymin, ymax
|
1169 |
-
# if not annotation["boxes"]:
|
1170 |
-
# data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA}
|
1171 |
-
# flattened_annotation_data.append(data_to_add)
|
1172 |
-
# else:
|
1173 |
-
# for box in annotation["boxes"]:
|
1174 |
-
# if 'xmin' not in box:
|
1175 |
-
# data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, 'xmax': pd.NA, 'ymin': pd.NA, 'ymax': pd.NA}
|
1176 |
-
# elif 'text' not in box:
|
1177 |
-
# data_to_add = {"image": image_path, "page": reported_number, **box}
|
1178 |
-
# else:
|
1179 |
-
# data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
|
1180 |
-
# flattened_annotation_data.append(data_to_add)
|
1181 |
-
|
1182 |
-
# # Convert to a DataFrame
|
1183 |
-
# review_file_df = pd.DataFrame(flattened_annotation_data)
|
1184 |
-
|
1185 |
-
# return review_file_df
|
1186 |
-
|
1187 |
-
# def create_annotation_dicts_from_annotation_df(all_image_annotations_df:pd.DataFrame, page_sizes:List[dict]):
|
1188 |
-
# '''
|
1189 |
-
# From an annotation object as a dataframe, convert back to a list of dictionaries that can be used in the Gradio Image Annotator component
|
1190 |
-
# '''
|
1191 |
-
# result = []
|
1192 |
-
|
1193 |
-
# # Ensure that every page has an entry in the resulting list of dicts
|
1194 |
-
# for image_path in page_sizes:
|
1195 |
-
# annotation = {}
|
1196 |
-
# annotation["image"] = image_path["image_path"]
|
1197 |
-
# annotation["boxes"] = []
|
1198 |
-
|
1199 |
-
# result.append(annotation)
|
1200 |
-
|
1201 |
-
# # Then add in all the filled in data
|
1202 |
-
# for image, group in all_image_annotations_df.groupby('image'):
|
1203 |
-
# boxes = group[['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']].to_dict(orient='records')
|
1204 |
-
# result.append({'image': image, 'boxes': boxes})
|
1205 |
-
|
1206 |
-
# return result
|
1207 |
-
|
1208 |
def create_annotation_dicts_from_annotation_df(
|
1209 |
all_image_annotations_df: pd.DataFrame,
|
1210 |
page_sizes: List[Dict[str, Any]]
|
@@ -1228,9 +1181,12 @@ def create_annotation_dicts_from_annotation_df(
|
|
1228 |
|
1229 |
# 2. Define columns to extract for boxes and check availability
|
1230 |
# Make sure these columns actually exist in the DataFrame
|
1231 |
-
box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
|
1232 |
available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
|
1233 |
|
|
|
|
|
|
|
1234 |
if not available_cols:
|
1235 |
print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
|
1236 |
return list(image_dict.values()) # Return based on page_sizes only
|
@@ -1248,7 +1204,6 @@ def create_annotation_dicts_from_annotation_df(
|
|
1248 |
print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
|
1249 |
return list(image_dict.values())
|
1250 |
|
1251 |
-
|
1252 |
# Process groups
|
1253 |
try:
|
1254 |
for image_path, group in valid_box_df.groupby('image', observed=True, sort=False):
|
@@ -1271,122 +1226,353 @@ def create_annotation_dicts_from_annotation_df(
|
|
1271 |
|
1272 |
return result
|
1273 |
|
1274 |
-
|
1275 |
-
|
1276 |
-
|
1277 |
-
|
1278 |
-
# all_image_annotations_df: pd.DataFrame,
|
1279 |
-
# page_sizes: List[Dict[str, Any]]
|
1280 |
-
# ) -> List[Dict[str, Any]]:
|
1281 |
-
# '''
|
1282 |
-
# Convert annotation DataFrame back to list of dicts using Pandas merge.
|
1283 |
-
# Ensures all images from page_sizes are present without duplicates.
|
1284 |
-
# '''
|
1285 |
-
# # 1. Create a DataFrame containing all required image paths from page_sizes
|
1286 |
-
# if not page_sizes:
|
1287 |
-
# return []
|
1288 |
-
# all_image_paths = [item.get("image_path") for item in page_sizes if item.get("image_path")]
|
1289 |
-
# if not all_image_paths:
|
1290 |
-
# return []
|
1291 |
-
# # Use unique paths
|
1292 |
-
# pages_df = pd.DataFrame({'image': list(set(all_image_paths))})
|
1293 |
-
|
1294 |
-
# # Check if the DataFrame is empty or lacks necessary columns
|
1295 |
-
# if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
|
1296 |
-
# print("Warning: Annotation DataFrame is empty or missing 'image' column.")
|
1297 |
-
# # Add empty boxes column and return
|
1298 |
-
# pages_df['boxes'] = [[] for _ in range(len(pages_df))]
|
1299 |
-
# return pages_df.to_dict(orient='records')
|
1300 |
-
|
1301 |
-
# # 2. Define columns to extract and check availability
|
1302 |
-
# box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
|
1303 |
-
# available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
|
1304 |
-
|
1305 |
-
# if not available_cols:
|
1306 |
-
# print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
|
1307 |
-
# pages_df['boxes'] = [[] for _ in range(len(pages_df))]
|
1308 |
-
# return pages_df.to_dict(orient='records')
|
1309 |
-
|
1310 |
-
# # 3. Prepare the annotation data: drop invalid rows and aggregate boxes
|
1311 |
-
# coord_cols = ['xmin', 'ymin', 'xmax', 'ymax']
|
1312 |
-
# valid_box_df = all_image_annotations_df.dropna(
|
1313 |
-
# subset=[col for col in coord_cols if col in available_cols]
|
1314 |
-
# ).copy() # Use .copy()
|
1315 |
-
|
1316 |
-
# if valid_box_df.empty:
|
1317 |
-
# print("Warning: No valid annotation rows found after dropping NA coordinates.")
|
1318 |
-
# pages_df['boxes'] = [[] for _ in range(len(pages_df))]
|
1319 |
-
# return pages_df.to_dict(orient='records')
|
1320 |
-
|
1321 |
-
|
1322 |
-
# # Aggregate boxes into lists of dictionaries per image
|
1323 |
-
# def aggregate_boxes(group):
|
1324 |
-
# return group[available_cols].to_dict(orient='records')
|
1325 |
-
|
1326 |
-
# # Group by image and apply the aggregation
|
1327 |
-
# grouped_boxes = valid_box_df.groupby('image', observed=True, sort=False).apply(aggregate_boxes).reset_index(name='boxes')
|
1328 |
-
|
1329 |
-
# # 4. Perform a left merge: keep all images from pages_df, add boxes where they exist
|
1330 |
-
# merged_df = pd.merge(pages_df, grouped_boxes, on='image', how='left')
|
1331 |
-
|
1332 |
-
# # 5. Fill NaN in 'boxes' column (for images with no annotations) with empty lists
|
1333 |
-
# # Ensure the column exists before trying to fillna
|
1334 |
-
# if 'boxes' in merged_df.columns:
|
1335 |
-
# # Use apply with a lambda for robust filling of NAs or potential None values
|
1336 |
-
# merged_df['boxes'] = merged_df['boxes'].apply(lambda x: [] if pd.isna(x) else x)
|
1337 |
-
# else:
|
1338 |
-
# # Should not happen with left merge, but handle defensively
|
1339 |
-
# merged_df['boxes'] = [[] for _ in range(len(merged_df))]
|
1340 |
-
|
1341 |
-
|
1342 |
-
# # 6. Convert the final DataFrame to the list of dictionaries format
|
1343 |
-
# result = merged_df.to_dict(orient='records')
|
1344 |
-
|
1345 |
-
# return result
|
1346 |
-
|
1347 |
-
def convert_annotation_json_to_review_df(all_annotations:List[dict],
|
1348 |
-
redaction_decision_output:pd.DataFrame=pd.DataFrame(),
|
1349 |
-
page_sizes:pd.DataFrame=pd.DataFrame(),
|
1350 |
-
do_proximity_match:bool=True) -> pd.DataFrame:
|
1351 |
'''
|
1352 |
-
Convert the annotation json data to a dataframe format.
|
|
|
|
|
1353 |
'''
|
1354 |
-
|
1355 |
-
review_file_df = convert_annotation_data_to_dataframe(all_annotations)
|
1356 |
-
|
1357 |
-
if page_sizes:
|
1358 |
-
page_sizes_df = pd.DataFrame(page_sizes)
|
1359 |
-
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
1360 |
-
|
1361 |
-
review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
1362 |
-
|
1363 |
-
redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)
|
1364 |
|
1365 |
-
#
|
1366 |
-
|
|
|
1367 |
|
1368 |
-
|
1369 |
-
review_file_df = do_proximity_match_all_pages_for_text(df1 = review_file_df.copy(), df2 = redaction_decision_output.copy())
|
1370 |
-
|
1371 |
-
# Ensure required columns exist, filling with blank if they don't
|
1372 |
-
check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
|
1373 |
|
1374 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1375 |
if col not in review_file_df.columns:
|
|
|
|
|
1376 |
review_file_df[col] = ''
|
1377 |
|
1378 |
-
|
1379 |
-
|
1380 |
-
else:
|
1381 |
-
review_file_df = pd.DataFrame(columns=check_columns)
|
1382 |
|
|
|
1383 |
# If colours are saved as list, convert to tuple
|
1384 |
-
|
|
|
1385 |
|
1386 |
-
|
|
|
|
|
|
|
|
|
|
|
1387 |
|
1388 |
return review_file_df
|
1389 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1390 |
def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
|
1391 |
image_paths:List[Image.Image],
|
1392 |
page_sizes:List[dict]=[]) -> List[dict]:
|
@@ -1404,9 +1590,15 @@ def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
|
|
1404 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
1405 |
|
1406 |
review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
1407 |
|
1408 |
# Keep only necessary columns
|
1409 |
-
review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label"])
|
1410 |
|
1411 |
# If colours are saved as list, convert to tuple
|
1412 |
review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
|
|
19 |
from pdf2image import convert_from_path
|
20 |
from PIL import Image
|
21 |
from scipy.spatial import cKDTree
|
22 |
+
import random
|
23 |
+
import string
|
24 |
|
25 |
IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
|
26 |
|
|
|
836 |
# Filter items with non-empty boxes
|
837 |
non_empty_boxes = [item for item in items if item.get('boxes')]
|
838 |
|
839 |
+
# Remove 'text' elements from boxes (deprecated)
|
840 |
+
#for item in non_empty_boxes:
|
841 |
+
# if 'boxes' in item:
|
842 |
+
# item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
|
843 |
|
844 |
if non_empty_boxes:
|
845 |
# Keep the first entry with non-empty boxes
|
|
|
857 |
review_file_df_out = review_file_df
|
858 |
|
859 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
860 |
+
coord_cols = [xmin, xmax, ymin, ymax]
|
861 |
+
for col in coord_cols:
|
862 |
+
review_file_df.loc[:, col] = pd.to_numeric(review_file_df[col], errors="coerce")
|
863 |
+
|
864 |
review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]
|
865 |
+
|
866 |
+
#print("review_file_df_orig:", review_file_df_orig)
|
867 |
|
868 |
+
review_file_df_div = review_file_df.loc[(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) & (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1),:]
|
869 |
|
870 |
+
#print("review_file_df_div:", review_file_df_div)
|
871 |
|
872 |
+
review_file_df_div.loc[:, "page"] = pd.to_numeric(review_file_df_div["page"], errors="coerce")
|
873 |
|
874 |
if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
|
875 |
|
|
|
910 |
|
911 |
|
912 |
if xmin in review_file_df.columns and not review_file_df.empty:
|
913 |
+
|
914 |
+
coord_cols = [xmin, xmax, ymin, ymax]
|
915 |
+
for col in coord_cols:
|
916 |
+
review_file_df.loc[:, col] = pd.to_numeric(review_file_df[col], errors="coerce")
|
917 |
+
|
918 |
# Separate absolute vs relative coordinates
|
919 |
review_file_df_orig = review_file_df.loc[
|
920 |
(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
|
|
|
1027 |
if not 'text' in df2.columns: df2['text'] = ''
|
1028 |
if not 'text' in df1.columns: df1['text'] = ''
|
1029 |
|
1030 |
+
for col in ['xmin', 'ymin', 'xmax', 'ymax']:
|
1031 |
+
df1[col] = pd.to_numeric(df1[col], errors='coerce')
|
1032 |
+
|
1033 |
+
for col in ['xmin', 'ymin', 'xmax', 'ymax']:
|
1034 |
+
df2[col] = pd.to_numeric(df2[col], errors='coerce')
|
1035 |
+
|
1036 |
# Create a unique key based on coordinates and label for exact merge
|
1037 |
merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
|
1038 |
df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
|
|
|
1050 |
|
1051 |
# Handle missing matches using a proximity-based approach
|
1052 |
# Convert coordinates to numpy arrays for KDTree lookup
|
1053 |
+
|
1054 |
+
|
1055 |
query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)
|
1056 |
|
1057 |
# Check for NaN or infinite values in query_coords and filter them out
|
|
|
1085 |
|
1086 |
return merged_df
|
1087 |
|
|
|
|
|
|
|
1088 |
def _extract_page_number(image_path: Any) -> int:
|
1089 |
"""Helper function to safely extract page number."""
|
1090 |
if not isinstance(image_path, str):
|
|
|
1103 |
'''
|
1104 |
if not all_annotations:
|
1105 |
# Return an empty DataFrame with the expected schema if input is empty
|
1106 |
+
return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
1107 |
|
1108 |
# 1. Create initial DataFrame from the list of annotations
|
1109 |
# Use list comprehensions with .get() for robustness
|
|
|
1120 |
# Explode removes rows where the list is empty. We want to keep them
|
1121 |
# as rows with NA values. Replace empty lists with a list containing
|
1122 |
# a single placeholder dictionary.
|
1123 |
+
placeholder_box = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA, "id": pd.NA}
|
1124 |
df['boxes'] = df['boxes'].apply(lambda x: x if x else [placeholder_box])
|
1125 |
|
1126 |
# 4. Explode the 'boxes' column. Each item in the list becomes a new row.
|
|
|
1142 |
# prevents this from being necessary.
|
1143 |
|
1144 |
# 7. Ensure essential columns exist and set column order
|
1145 |
+
essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id"]
|
1146 |
for col in essential_box_cols:
|
1147 |
if col not in final_df.columns:
|
1148 |
final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
|
|
|
1158 |
|
1159 |
return final_df
|
1160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1161 |
def create_annotation_dicts_from_annotation_df(
|
1162 |
all_image_annotations_df: pd.DataFrame,
|
1163 |
page_sizes: List[Dict[str, Any]]
|
|
|
1181 |
|
1182 |
# 2. Define columns to extract for boxes and check availability
|
1183 |
# Make sure these columns actually exist in the DataFrame
|
1184 |
+
box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label', 'text', 'id']
|
1185 |
available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
|
1186 |
|
1187 |
+
if 'text' in all_image_annotations_df.columns:
|
1188 |
+
all_image_annotations_df.loc[all_image_annotations_df['text'].isnull(), 'text'] = ''
|
1189 |
+
|
1190 |
if not available_cols:
|
1191 |
print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
|
1192 |
return list(image_dict.values()) # Return based on page_sizes only
|
|
|
1204 |
print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
|
1205 |
return list(image_dict.values())
|
1206 |
|
|
|
1207 |
# Process groups
|
1208 |
try:
|
1209 |
for image_path, group in valid_box_df.groupby('image', observed=True, sort=False):
|
|
|
1226 |
|
1227 |
return result
|
1228 |
|
1229 |
+
def convert_annotation_json_to_review_df(all_annotations: List[dict],
|
1230 |
+
redaction_decision_output: pd.DataFrame = pd.DataFrame(),
|
1231 |
+
page_sizes: List[dict] = [],
|
1232 |
+
do_proximity_match: bool = True) -> pd.DataFrame:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1233 |
'''
|
1234 |
+
Convert the annotation json data to a dataframe format.
|
1235 |
+
Add on any text from the initial review_file dataframe by joining based on 'id' if available
|
1236 |
+
in both sources, otherwise falling back to joining on pages/co-ordinates (if option selected).
|
1237 |
'''
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1238 |
|
1239 |
+
# 1. Convert annotations to DataFrame
|
1240 |
+
# Ensure convert_annotation_data_to_dataframe populates the 'id' column
|
1241 |
+
# if 'id' exists in the dictionaries within all_annotations.
|
1242 |
|
1243 |
+
review_file_df = convert_annotation_data_to_dataframe(all_annotations)
|
|
|
|
|
|
|
|
|
1244 |
|
1245 |
+
# Only keep rows in review_df where there are coordinates
|
1246 |
+
review_file_df.dropna(subset='xmin', axis=0, inplace=True)
|
1247 |
+
|
1248 |
+
# Exit early if the initial conversion results in an empty DataFrame
|
1249 |
+
if review_file_df.empty:
|
1250 |
+
# Define standard columns for an empty return DataFrame
|
1251 |
+
check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text", "id"]
|
1252 |
+
# Ensure 'id' is included if it might have been expected
|
1253 |
+
return pd.DataFrame(columns=[col for col in check_columns if col != 'id' or 'id' in review_file_df.columns])
|
1254 |
+
|
1255 |
+
# 2. Handle page sizes if provided
|
1256 |
+
if not page_sizes:
|
1257 |
+
page_sizes_df = pd.DataFrame(page_sizes) # Ensure it's a DataFrame
|
1258 |
+
# Safely convert page column to numeric
|
1259 |
+
page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
|
1260 |
+
page_sizes_df.dropna(subset=["page"], inplace=True) # Drop rows where conversion failed
|
1261 |
+
page_sizes_df["page"] = page_sizes_df["page"].astype(int) # Convert to int after handling errors/NaNs
|
1262 |
+
|
1263 |
+
|
1264 |
+
# Apply coordinate division if page_sizes_df is not empty after processing
|
1265 |
+
if not page_sizes_df.empty:
|
1266 |
+
# Ensure 'page' column in review_file_df is numeric for merging
|
1267 |
+
if 'page' in review_file_df.columns:
|
1268 |
+
review_file_df['page'] = pd.to_numeric(review_file_df['page'], errors='coerce')
|
1269 |
+
# Drop rows with invalid pages before division
|
1270 |
+
review_file_df.dropna(subset=['page'], inplace=True)
|
1271 |
+
review_file_df['page'] = review_file_df['page'].astype(int)
|
1272 |
+
review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
1273 |
+
|
1274 |
+
print("review_file_df after coord divide:", review_file_df)
|
1275 |
+
|
1276 |
+
# Also apply to redaction_decision_output if it's not empty and has page numbers
|
1277 |
+
if not redaction_decision_output.empty and 'page' in redaction_decision_output.columns:
|
1278 |
+
redaction_decision_output['page'] = pd.to_numeric(redaction_decision_output['page'], errors='coerce')
|
1279 |
+
# Drop rows with invalid pages before division
|
1280 |
+
redaction_decision_output.dropna(subset=['page'], inplace=True)
|
1281 |
+
redaction_decision_output['page'] = redaction_decision_output['page'].astype(int)
|
1282 |
+
redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)
|
1283 |
+
|
1284 |
+
print("redaction_decision_output after coord divide:", redaction_decision_output)
|
1285 |
+
else:
|
1286 |
+
print("Warning: Page sizes DataFrame became empty after processing, skipping coordinate division.")
|
1287 |
+
|
1288 |
+
|
1289 |
+
# 3. Join additional data from redaction_decision_output if provided
|
1290 |
+
if not redaction_decision_output.empty:
|
1291 |
+
# --- NEW LOGIC: Prioritize joining by 'id' ---
|
1292 |
+
id_col_exists_in_review = 'id' in review_file_df.columns
|
1293 |
+
id_col_exists_in_redaction = 'id' in redaction_decision_output.columns
|
1294 |
+
joined_by_id = False # Flag to track if ID join was successful
|
1295 |
+
|
1296 |
+
if id_col_exists_in_review and id_col_exists_in_redaction:
|
1297 |
+
#print("Attempting to join data based on 'id' column.")
|
1298 |
+
try:
|
1299 |
+
# Ensure 'id' columns are of compatible types (e.g., string) to avoid merge errors
|
1300 |
+
review_file_df['id'] = review_file_df['id'].astype(str)
|
1301 |
+
# Make a copy to avoid SettingWithCopyWarning if redaction_decision_output is used elsewhere
|
1302 |
+
redaction_copy = redaction_decision_output.copy()
|
1303 |
+
redaction_copy['id'] = redaction_copy['id'].astype(str)
|
1304 |
+
|
1305 |
+
# Select columns to merge from redaction output.
|
1306 |
+
# Primarily interested in 'text', but keep 'id' for the merge key.
|
1307 |
+
# Add other columns from redaction_copy if needed.
|
1308 |
+
cols_to_merge = ['id']
|
1309 |
+
if 'text' in redaction_copy.columns:
|
1310 |
+
cols_to_merge.append('text')
|
1311 |
+
else:
|
1312 |
+
print("Warning: 'text' column not found in redaction_decision_output. Cannot merge text using 'id'.")
|
1313 |
+
|
1314 |
+
# Perform a left merge to keep all annotations and add matching text
|
1315 |
+
# Suffixes prevent collision if 'text' already exists and we want to compare/choose
|
1316 |
+
original_cols = review_file_df.columns.tolist()
|
1317 |
+
merged_df = pd.merge(
|
1318 |
+
review_file_df,
|
1319 |
+
redaction_copy[cols_to_merge],
|
1320 |
+
on='id',
|
1321 |
+
how='left',
|
1322 |
+
suffixes=('', '_redaction') # Suffix applied to columns from right df if names clash
|
1323 |
+
)
|
1324 |
+
|
1325 |
+
# Update the original 'text' column. Prioritize text from redaction output.
|
1326 |
+
# If redaction output had 'text', a 'text_redaction' column now exists.
|
1327 |
+
if 'text_redaction' in merged_df.columns:
|
1328 |
+
if 'text' not in merged_df.columns: # If review_file_df didn't have text initially
|
1329 |
+
merged_df['text'] = merged_df['text_redaction']
|
1330 |
+
else:
|
1331 |
+
# Use text from redaction where available, otherwise keep original text
|
1332 |
+
merged_df['text'] = merged_df['text_redaction'].combine_first(merged_df['text'])
|
1333 |
+
|
1334 |
+
# Remove the temporary column
|
1335 |
+
merged_df = merged_df.drop(columns=['text_redaction'])
|
1336 |
+
|
1337 |
+
# Ensure final columns match original expectation + potentially new 'text'
|
1338 |
+
final_cols = original_cols
|
1339 |
+
if 'text' not in final_cols and 'text' in merged_df.columns:
|
1340 |
+
final_cols.append('text') # Make sure text column is kept if newly added
|
1341 |
+
# Reorder/select columns if necessary, ensuring 'id' is kept
|
1342 |
+
review_file_df = merged_df[[col for col in final_cols if col in merged_df.columns] + (['id'] if 'id' not in final_cols else [])]
|
1343 |
+
|
1344 |
+
|
1345 |
+
#print("Successfully joined data using 'id'.")
|
1346 |
+
joined_by_id = True
|
1347 |
+
|
1348 |
+
except Exception as e:
|
1349 |
+
print(f"Error during 'id'-based merge: {e}. Falling back to proximity match if enabled.")
|
1350 |
+
# Fall through to proximity match below if an error occurred
|
1351 |
+
|
1352 |
+
# --- Fallback to proximity match ---
|
1353 |
+
if not joined_by_id and do_proximity_match:
|
1354 |
+
if not id_col_exists_in_review or not id_col_exists_in_redaction:
|
1355 |
+
print("Could not join by 'id' (column missing in one or both sources).")
|
1356 |
+
print("Performing proximity match to add text data.")
|
1357 |
+
# Match text to review file using proximity
|
1358 |
+
|
1359 |
+
review_file_df = do_proximity_match_all_pages_for_text(df1=review_file_df.copy(), df2=redaction_decision_output.copy())
|
1360 |
+
elif not joined_by_id and not do_proximity_match:
|
1361 |
+
print("Skipping joining text data (ID join not possible, proximity match disabled).")
|
1362 |
+
# --- End of join logic ---
|
1363 |
+
|
1364 |
+
# 4. Ensure required columns exist, filling with blank if they don't
|
1365 |
+
# Define base required columns, 'id' might or might not be present initially
|
1366 |
+
required_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
|
1367 |
+
# Add 'id' to required list if it exists in the dataframe at this point
|
1368 |
+
if 'id' in review_file_df.columns:
|
1369 |
+
required_columns.append('id')
|
1370 |
+
|
1371 |
+
for col in required_columns:
|
1372 |
if col not in review_file_df.columns:
|
1373 |
+
# Decide default value based on column type (e.g., '' for text, np.nan for numeric?)
|
1374 |
+
# Using '' for simplicity here.
|
1375 |
review_file_df[col] = ''
|
1376 |
|
1377 |
+
# Select and order the final set of columns
|
1378 |
+
review_file_df = review_file_df[required_columns]
|
|
|
|
|
1379 |
|
1380 |
+
# 5. Final processing and sorting
|
1381 |
# If colours are saved as list, convert to tuple
|
1382 |
+
if 'color' in review_file_df.columns:
|
1383 |
+
review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
1384 |
|
1385 |
+
# Sort the results
|
1386 |
+
sort_columns = ['page', 'ymin', 'xmin', 'label']
|
1387 |
+
# Ensure sort columns exist before sorting
|
1388 |
+
valid_sort_columns = [col for col in sort_columns if col in review_file_df.columns]
|
1389 |
+
if valid_sort_columns:
|
1390 |
+
review_file_df = review_file_df.sort_values(valid_sort_columns)
|
1391 |
|
1392 |
return review_file_df
|
1393 |
|
1394 |
+
def fill_missing_box_ids(data_input: dict) -> dict:
|
1395 |
+
"""
|
1396 |
+
Generates unique alphanumeric IDs for bounding boxes in an input dictionary
|
1397 |
+
where the 'id' is missing, blank, or not a 12-character string.
|
1398 |
+
|
1399 |
+
Args:
|
1400 |
+
data_input (dict): The input dictionary containing 'image' and 'boxes' keys.
|
1401 |
+
'boxes' should be a list of dictionaries, each potentially
|
1402 |
+
with an 'id' key.
|
1403 |
+
|
1404 |
+
Returns:
|
1405 |
+
dict: The input dictionary with missing/invalid box IDs filled.
|
1406 |
+
Note: The function modifies the input dictionary in place.
|
1407 |
+
"""
|
1408 |
+
|
1409 |
+
# --- Input Validation ---
|
1410 |
+
if not isinstance(data_input, dict):
|
1411 |
+
raise TypeError("Input 'data_input' must be a dictionary.")
|
1412 |
+
#if 'boxes' not in data_input or not isinstance(data_input.get('boxes'), list):
|
1413 |
+
# raise ValueError("Input dictionary must contain a 'boxes' key with a list value.")
|
1414 |
+
|
1415 |
+
boxes = data_input#['boxes']
|
1416 |
+
id_length = 12
|
1417 |
+
character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
|
1418 |
+
|
1419 |
+
# --- Get Existing IDs to Ensure Uniqueness ---
|
1420 |
+
# Collect all valid existing IDs first
|
1421 |
+
existing_ids = set()
|
1422 |
+
#for box in boxes:
|
1423 |
+
# Check if 'id' exists, is a string, and is the correct length
|
1424 |
+
box_id = boxes.get('id')
|
1425 |
+
if isinstance(box_id, str) and len(box_id) == id_length:
|
1426 |
+
existing_ids.add(box_id)
|
1427 |
+
|
1428 |
+
# --- Identify and Fill Rows Needing IDs ---
|
1429 |
+
generated_ids_set = set() # Keep track of IDs generated *in this run*
|
1430 |
+
num_filled = 0
|
1431 |
+
|
1432 |
+
#for box in boxes:
|
1433 |
+
box_id = boxes.get('id')
|
1434 |
+
|
1435 |
+
# Check if ID needs to be generated
|
1436 |
+
# Needs ID if: key is missing, value is None, value is not a string,
|
1437 |
+
# value is an empty string after stripping whitespace, or value is a string
|
1438 |
+
# but not of the correct length.
|
1439 |
+
needs_new_id = (
|
1440 |
+
box_id is None or
|
1441 |
+
not isinstance(box_id, str) or
|
1442 |
+
box_id.strip() == "" or
|
1443 |
+
len(box_id) != id_length
|
1444 |
+
)
|
1445 |
+
|
1446 |
+
if needs_new_id:
|
1447 |
+
# Generate a unique ID
|
1448 |
+
attempts = 0
|
1449 |
+
while True:
|
1450 |
+
candidate_id = ''.join(random.choices(character_set, k=id_length))
|
1451 |
+
# Check against *all* existing valid IDs and *newly* generated ones in this run
|
1452 |
+
if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
|
1453 |
+
generated_ids_set.add(candidate_id)
|
1454 |
+
boxes['id'] = candidate_id # Assign the new ID directly to the box dict
|
1455 |
+
num_filled += 1
|
1456 |
+
break # Found a unique ID
|
1457 |
+
attempts += 1
|
1458 |
+
# Safety break for unlikely infinite loop (though highly improbable with 12 chars)
|
1459 |
+
if attempts > len(boxes) * 100 + 1000:
|
1460 |
+
raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check ID length or existing IDs.")
|
1461 |
+
|
1462 |
+
if num_filled > 0:
|
1463 |
+
pass
|
1464 |
+
#print(f"Successfully filled {num_filled} missing or invalid box IDs.")
|
1465 |
+
else:
|
1466 |
+
pass
|
1467 |
+
#print("No missing or invalid box IDs found.")
|
1468 |
+
|
1469 |
+
|
1470 |
+
# The input dictionary 'data_input' has been modified in place
|
1471 |
+
return data_input
|
1472 |
+
|
1473 |
+
def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
    """
    Generates unique alphanumeric IDs for rows in a DataFrame column
    where the value is missing (NaN, None), not a string, or not exactly
    `length` characters long.

    Args:
        df (pd.DataFrame): The input Pandas DataFrame.
        column_name (str): The name of the column to check and fill (defaults to 'id').
                           This column will be added if it doesn't exist.
        length (int): The desired length of the generated IDs (defaults to 12).
                      Cannot exceed the limits that guarantee uniqueness based
                      on the number of IDs needed and character set size.

    Returns:
        pd.DataFrame: The DataFrame with missing/invalid IDs filled in the specified
                      column. Note: The function modifies the DataFrame in place.

    Raises:
        TypeError: If 'df' is not a Pandas DataFrame.
        ValueError: If 'column_name' is empty, 'length' is not a positive integer,
                    or more unique IDs are needed than the ID space can provide.
        RuntimeError: If unique ID generation keeps colliding (practically
                      unreachable with the default 12-character alphanumeric IDs).
    """

    # --- Input Validation ---
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input 'df' must be a Pandas DataFrame.")
    if not isinstance(column_name, str) or not column_name:
        raise ValueError("'column_name' must be a non-empty string.")
    if not isinstance(length, int) or length <= 0:
        raise ValueError("'length' must be a positive integer.")

    # --- Ensure Column Exists ---
    if column_name not in df.columns:
        print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
        df[column_name] = np.nan  # Initialize with NaN

    # --- Identify Rows Needing IDs ---
    # BUGFIX: the previous implementation cast the whole column to str *before*
    # calling isna(), which turned NaN/None into the literal string "nan" and
    # made the NaN check dead code (and mutated valid values even when nothing
    # needed filling). Compute the mask on the original values instead.
    def _needs_new_id(value) -> bool:
        # NaN, None, and any non-string (numbers, objects) all need a fresh ID,
        # as does any string that is not exactly `length` characters long.
        if not isinstance(value, str):
            return True
        return len(value) != length

    is_missing_or_empty = df[column_name].map(_needs_new_id)

    rows_to_fill_index = df.index[is_missing_or_empty]
    num_needed = len(rows_to_fill_index)

    if num_needed == 0:
        # Nothing to do; leave the column untouched.
        return df

    print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")

    # --- Get Existing IDs to Ensure Uniqueness ---
    # Rows not flagged by the mask are guaranteed to hold strings already.
    existing_ids = set(df.loc[~is_missing_or_empty, column_name].astype(str))

    # --- Generate Unique IDs ---
    character_set = string.ascii_letters + string.digits  # a-z, A-Z, 0-9

    max_possible_ids = len(character_set) ** length
    if num_needed > max_possible_ids:
        raise ValueError(f"Cannot generate {num_needed} unique IDs with length {length}. Maximum possible is {max_possible_ids}.")

    generated_ids_set = set()  # Keep track of IDs generated *in this run*
    new_ids_list = []          # Store the generated IDs in row order

    for i in range(num_needed):
        attempts = 0
        while True:
            candidate_id = ''.join(random.choices(character_set, k=length))
            # Check against *all* existing IDs and *newly* generated ones
            if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
                generated_ids_set.add(candidate_id)
                new_ids_list.append(candidate_id)
                break  # Found a unique ID
            attempts += 1
            # Safety break for an unlikely infinite loop
            if attempts > num_needed * 100 and attempts > 1000:
                raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check length and character set or existing IDs.")

        # Optional progress update for large numbers
        if (i + 1) % 1000 == 0:
            print(f"Generated {i+1}/{num_needed} IDs...")

    # --- Assign New IDs ---
    # Use the previously identified index to assign the new IDs correctly,
    # then normalise the column to string dtype (downstream code expects str).
    df.loc[rows_to_fill_index, column_name] = new_ids_list
    df[column_name] = df[column_name].astype(str)

    # The DataFrame 'df' has been modified in place
    return df
|
1575 |
+
|
1576 |
def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
|
1577 |
image_paths:List[Image.Image],
|
1578 |
page_sizes:List[dict]=[]) -> List[dict]:
|
|
|
1590 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
1591 |
|
1592 |
review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
|
1593 |
+
|
1594 |
+
review_file_df = fill_missing_ids(review_file_df)
|
1595 |
+
|
1596 |
+
if 'id' not in review_file_df.columns:
|
1597 |
+
review_file_df['id'] = ''
|
1598 |
+
review_file_df['id'] = review_file_df['id'].astype(str)
|
1599 |
|
1600 |
# Keep only necessary columns
|
1601 |
+
review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "id", "text"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label", "id"])
|
1602 |
|
1603 |
# If colours are saved as list, convert to tuple
|
1604 |
review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
|
tools/file_redaction.py
CHANGED
@@ -21,7 +21,7 @@ from collections import defaultdict # For efficient grouping
|
|
21 |
|
22 |
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
|
23 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
24 |
-
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
|
25 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
26 |
from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
|
27 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
@@ -166,10 +166,10 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
166 |
|
167 |
# Ensure all_pages_decision_process_table is in correct format for downstream processes
|
168 |
if isinstance(all_pages_decision_process_table,list):
|
169 |
-
if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"])
|
170 |
elif isinstance(all_pages_decision_process_table, pd.DataFrame):
|
171 |
if all_pages_decision_process_table.empty:
|
172 |
-
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"])
|
173 |
|
174 |
# If this is the first time around, set variables to 0/blank
|
175 |
if first_loop_state==True:
|
@@ -211,6 +211,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
211 |
if latest_file_completed >= number_of_files:
|
212 |
|
213 |
print("Completed last file")
|
|
|
214 |
current_loop_page = 0
|
215 |
|
216 |
if isinstance(out_message, list) and out_message:
|
@@ -383,7 +384,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
383 |
|
384 |
progress(0.5, desc="Extracting text and redacting document")
|
385 |
|
386 |
-
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax",
|
387 |
all_line_level_ocr_results_df = pd.DataFrame()
|
388 |
|
389 |
# Run through file loop, redact each file at a time
|
@@ -502,6 +503,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
502 |
if latest_file_completed != len(file_paths_list):
|
503 |
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
504 |
|
|
|
|
|
505 |
# Save redacted file
|
506 |
if pii_identification_method != no_redaction_option:
|
507 |
if is_pdf(file_path) == False:
|
@@ -512,7 +515,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
512 |
#
|
513 |
else:
|
514 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
515 |
-
print("
|
516 |
pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
|
517 |
|
518 |
out_file_paths.append(out_redacted_pdf_file_path)
|
@@ -522,7 +525,6 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
522 |
else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
523 |
|
524 |
ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
|
525 |
-
|
526 |
all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
|
527 |
|
528 |
all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
|
@@ -539,6 +541,8 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
539 |
|
540 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
541 |
|
|
|
|
|
542 |
# Save the gradio_annotation_boxes to a review csv file
|
543 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
544 |
|
@@ -838,7 +842,10 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
|
|
838 |
if hasattr(annot, 'text') and annot.text:
|
839 |
img_annotation_box["text"] = str(annot.text)
|
840 |
else:
|
841 |
-
img_annotation_box["text"] = ""
|
|
|
|
|
|
|
842 |
|
843 |
return img_annotation_box, rect
|
844 |
|
@@ -953,6 +960,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
953 |
page_annotations = page_annotations["boxes"]
|
954 |
|
955 |
for annot in page_annotations:
|
|
|
|
|
|
|
|
|
956 |
# Check if an Image recogniser result, or a Gradio annotation object
|
957 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
958 |
|
@@ -960,6 +971,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
960 |
|
961 |
# Should already be in correct format if img_annotator_box is an input
|
962 |
if isinstance(annot, dict):
|
|
|
963 |
img_annotation_box = annot
|
964 |
|
965 |
box_coordinates = (img_annotation_box['xmin'], img_annotation_box['ymin'], img_annotation_box['xmax'], img_annotation_box['ymax'])
|
@@ -1004,6 +1016,8 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
|
|
1004 |
|
1005 |
img_annotation_box, rect = convert_pikepdf_annotations_to_result_annotation_box(page, annot, image, convert_pikepdf_to_pymupdf_coords, page_sizes_df, image_dimensions=image_dimensions)
|
1006 |
|
|
|
|
|
1007 |
#print("image_dimensions:", image_dimensions)
|
1008 |
#print("annot:", annot)
|
1009 |
|
@@ -1155,7 +1169,7 @@ def redact_image_pdf(file_path:str,
|
|
1155 |
page_break_return:bool=False,
|
1156 |
annotations_all_pages:List=[],
|
1157 |
all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
|
1158 |
-
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"]),
|
1159 |
pymupdf_doc:Document = [],
|
1160 |
pii_identification_method:str="Local",
|
1161 |
comprehend_query_number:int=0,
|
@@ -1490,11 +1504,15 @@ def redact_image_pdf(file_path:str,
|
|
1490 |
'start': result.start,
|
1491 |
'end': result.end,
|
1492 |
'score': result.score,
|
1493 |
-
'page': reported_page_number
|
1494 |
} for result in page_merged_redaction_bboxes])
|
1495 |
|
1496 |
all_pages_decision_process_table_list.append(decision_process_table)
|
1497 |
|
|
|
|
|
|
|
|
|
1498 |
# Convert to DataFrame and add to ongoing logging table
|
1499 |
line_level_ocr_results_df = pd.DataFrame([{
|
1500 |
'page': reported_page_number,
|
@@ -1739,12 +1757,16 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
1739 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1740 |
|
1741 |
# Convert the new columns to integers (if needed)
|
1742 |
-
analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
1743 |
|
1744 |
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
1745 |
analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
|
1746 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1747 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
|
|
|
|
|
|
|
|
1748 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
1749 |
|
1750 |
return decision_process_table
|
@@ -1786,7 +1808,7 @@ def redact_text_pdf(
|
|
1786 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
1787 |
annotations_all_pages: List[dict] = [], # List of annotations across all pages
|
1788 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1789 |
-
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax",
|
1790 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1791 |
pii_identification_method: str = "Local",
|
1792 |
comprehend_query_number:int = 0,
|
@@ -1967,7 +1989,7 @@ def redact_text_pdf(
|
|
1967 |
pymupdf_page, page_image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_redaction_annotations_on_page, image_path, redact_whole_page=redact_whole_page, convert_pikepdf_to_pymupdf_coords=True, original_cropbox=original_cropboxes[page_no], page_sizes_df=page_sizes_df)
|
1968 |
|
1969 |
# Create decision process table
|
1970 |
-
page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
|
1971 |
|
1972 |
if not page_decision_process_table.empty:
|
1973 |
all_pages_decision_process_table_list.append(page_decision_process_table)
|
@@ -2035,7 +2057,7 @@ def redact_text_pdf(
|
|
2035 |
|
2036 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2037 |
|
2038 |
-
# Write
|
2039 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
2040 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
2041 |
|
|
|
21 |
|
22 |
from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
|
23 |
from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
|
24 |
+
from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids
|
25 |
from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
|
26 |
from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
|
27 |
from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
|
|
|
166 |
|
167 |
# Ensure all_pages_decision_process_table is in correct format for downstream processes
|
168 |
if isinstance(all_pages_decision_process_table,list):
|
169 |
+
if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
170 |
elif isinstance(all_pages_decision_process_table, pd.DataFrame):
|
171 |
if all_pages_decision_process_table.empty:
|
172 |
+
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
173 |
|
174 |
# If this is the first time around, set variables to 0/blank
|
175 |
if first_loop_state==True:
|
|
|
211 |
if latest_file_completed >= number_of_files:
|
212 |
|
213 |
print("Completed last file")
|
214 |
+
progress(0.95, "Completed last file, performing final checks")
|
215 |
current_loop_page = 0
|
216 |
|
217 |
if isinstance(out_message, list) and out_message:
|
|
|
384 |
|
385 |
progress(0.5, desc="Extracting text and redacting document")
|
386 |
|
387 |
+
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
388 |
all_line_level_ocr_results_df = pd.DataFrame()
|
389 |
|
390 |
# Run through file loop, redact each file at a time
|
|
|
503 |
if latest_file_completed != len(file_paths_list):
|
504 |
print("Completed file number:", str(latest_file_completed), "there are more files to do")
|
505 |
|
506 |
+
progress(0.9, "Saving redacted PDF file")
|
507 |
+
|
508 |
# Save redacted file
|
509 |
if pii_identification_method != no_redaction_option:
|
510 |
if is_pdf(file_path) == False:
|
|
|
515 |
#
|
516 |
else:
|
517 |
out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
|
518 |
+
print("Saving redacted PDF file:", out_redacted_pdf_file_path)
|
519 |
pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
|
520 |
|
521 |
out_file_paths.append(out_redacted_pdf_file_path)
|
|
|
525 |
else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
526 |
|
527 |
ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
|
|
|
528 |
all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
|
529 |
|
530 |
all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
|
|
|
541 |
|
542 |
annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
|
543 |
|
544 |
+
|
545 |
+
|
546 |
# Save the gradio_annotation_boxes to a review csv file
|
547 |
review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
|
548 |
|
|
|
842 |
if hasattr(annot, 'text') and annot.text:
|
843 |
img_annotation_box["text"] = str(annot.text)
|
844 |
else:
|
845 |
+
img_annotation_box["text"] = ""
|
846 |
+
|
847 |
+
# Assign an id
|
848 |
+
img_annotation_box = fill_missing_box_ids(img_annotation_box)
|
849 |
|
850 |
return img_annotation_box, rect
|
851 |
|
|
|
960 |
page_annotations = page_annotations["boxes"]
|
961 |
|
962 |
for annot in page_annotations:
|
963 |
+
|
964 |
+
|
965 |
+
|
966 |
+
|
967 |
# Check if an Image recogniser result, or a Gradio annotation object
|
968 |
if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
|
969 |
|
|
|
971 |
|
972 |
# Should already be in correct format if img_annotator_box is an input
|
973 |
if isinstance(annot, dict):
|
974 |
+
annot = fill_missing_box_ids(annot)
|
975 |
img_annotation_box = annot
|
976 |
|
977 |
box_coordinates = (img_annotation_box['xmin'], img_annotation_box['ymin'], img_annotation_box['xmax'], img_annotation_box['ymax'])
|
|
|
1016 |
|
1017 |
img_annotation_box, rect = convert_pikepdf_annotations_to_result_annotation_box(page, annot, image, convert_pikepdf_to_pymupdf_coords, page_sizes_df, image_dimensions=image_dimensions)
|
1018 |
|
1019 |
+
img_annotation_box = fill_missing_box_ids(img_annotation_box)
|
1020 |
+
|
1021 |
#print("image_dimensions:", image_dimensions)
|
1022 |
#print("annot:", annot)
|
1023 |
|
|
|
1169 |
page_break_return:bool=False,
|
1170 |
annotations_all_pages:List=[],
|
1171 |
all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
|
1172 |
+
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
|
1173 |
pymupdf_doc:Document = [],
|
1174 |
pii_identification_method:str="Local",
|
1175 |
comprehend_query_number:int=0,
|
|
|
1504 |
'start': result.start,
|
1505 |
'end': result.end,
|
1506 |
'score': result.score,
|
1507 |
+
'page': reported_page_number
|
1508 |
} for result in page_merged_redaction_bboxes])
|
1509 |
|
1510 |
all_pages_decision_process_table_list.append(decision_process_table)
|
1511 |
|
1512 |
+
decision_process_table = fill_missing_ids(decision_process_table)
|
1513 |
+
#decision_process_table.to_csv("output/decision_process_table_with_ids.csv")
|
1514 |
+
|
1515 |
+
|
1516 |
# Convert to DataFrame and add to ongoing logging table
|
1517 |
line_level_ocr_results_df = pd.DataFrame([{
|
1518 |
'page': reported_page_number,
|
|
|
1757 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1758 |
|
1759 |
# Convert the new columns to integers (if needed)
|
1760 |
+
#analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
1761 |
|
1762 |
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
1763 |
analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
|
1764 |
analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
|
1765 |
analysed_bounding_boxes_df_new['page'] = page_num + 1
|
1766 |
+
|
1767 |
+
#analysed_bounding_boxes_df_new = fill_missing_ids(analysed_bounding_boxes_df_new)
|
1768 |
+
analysed_bounding_boxes_df_new.to_csv("output/analysed_bounding_boxes_df_new_with_ids.csv")
|
1769 |
+
|
1770 |
decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
|
1771 |
|
1772 |
return decision_process_table
|
|
|
1808 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
1809 |
annotations_all_pages: List[dict] = [], # List of annotations across all pages
|
1810 |
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1811 |
+
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
1812 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1813 |
pii_identification_method: str = "Local",
|
1814 |
comprehend_query_number:int = 0,
|
|
|
1989 |
pymupdf_page, page_image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_redaction_annotations_on_page, image_path, redact_whole_page=redact_whole_page, convert_pikepdf_to_pymupdf_coords=True, original_cropbox=original_cropboxes[page_no], page_sizes_df=page_sizes_df)
|
1990 |
|
1991 |
# Create decision process table
|
1992 |
+
page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
|
1993 |
|
1994 |
if not page_decision_process_table.empty:
|
1995 |
all_pages_decision_process_table_list.append(page_decision_process_table)
|
|
|
2057 |
|
2058 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
2059 |
|
2060 |
+
# Write all page outputs
|
2061 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
2062 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
2063 |
|
tools/redaction_review.py
CHANGED
@@ -15,7 +15,7 @@ import pymupdf
|
|
15 |
from PIL import ImageDraw, Image
|
16 |
|
17 |
from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
|
18 |
-
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
|
19 |
from tools.helper_functions import get_file_name_without_type, detect_file_type
|
20 |
from tools.file_redaction import redact_page_with_pymupdf
|
21 |
|
@@ -99,6 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
|
|
99 |
review_dataframe = review_df
|
100 |
|
101 |
try:
|
|
|
|
|
102 |
review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
|
103 |
|
104 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
|
@@ -114,13 +116,13 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
|
|
114 |
page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
|
115 |
page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
|
116 |
|
117 |
-
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(
|
118 |
|
119 |
-
recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
|
120 |
|
121 |
except Exception as e:
|
122 |
print("Could not extract recogniser information:", e)
|
123 |
-
recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text"]]
|
124 |
|
125 |
label_choices = review_dataframe["label"].astype(str).unique().tolist()
|
126 |
text_choices = review_dataframe["text"].astype(str).unique().tolist()
|
@@ -151,7 +153,7 @@ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData,
|
|
151 |
|
152 |
review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
|
153 |
|
154 |
-
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(
|
155 |
|
156 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
|
157 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
@@ -179,15 +181,32 @@ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
|
|
179 |
'''
|
180 |
out_image_annotations_state = current_image_annotations_state
|
181 |
out_current_page_annotator = current_page_annotator
|
|
|
182 |
|
183 |
if not review_df.empty:
|
184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
185 |
out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
|
186 |
|
187 |
-
|
188 |
|
189 |
-
|
190 |
-
|
|
|
191 |
|
192 |
return out_current_page_annotator, out_image_annotations_state
|
193 |
|
@@ -206,24 +225,30 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
|
|
206 |
backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
207 |
|
208 |
if not selected_rows_df.empty and not review_df.empty:
|
209 |
-
|
210 |
-
|
|
|
|
|
|
|
|
|
211 |
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
|
|
|
|
|
|
216 |
out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
|
217 |
|
218 |
out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
|
219 |
|
220 |
-
out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
|
221 |
|
222 |
# Either there is nothing left in the selection dataframe, or the review dataframe
|
223 |
else:
|
224 |
out_review_df = review_df
|
225 |
out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
226 |
-
|
227 |
out_image_annotations_state = image_annotations_state
|
228 |
|
229 |
return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
@@ -234,7 +259,7 @@ def update_annotator_object_and_filter_df(
|
|
234 |
recogniser_entities_dropdown_value:str="ALL",
|
235 |
page_dropdown_value:str="ALL",
|
236 |
text_dropdown_value:str="ALL",
|
237 |
-
recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400),
|
238 |
zoom:int=100,
|
239 |
review_df:pd.DataFrame=[],
|
240 |
page_sizes:List[dict]=[],
|
@@ -244,6 +269,8 @@ def update_annotator_object_and_filter_df(
|
|
244 |
Update a gradio_image_annotation object with new annotation data.
|
245 |
'''
|
246 |
zoom_str = str(zoom) + '%'
|
|
|
|
|
247 |
|
248 |
if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
|
249 |
|
@@ -295,10 +322,7 @@ def update_annotator_object_and_filter_df(
|
|
295 |
|
296 |
replaced_image_path = current_image_path
|
297 |
|
298 |
-
if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"])
|
299 |
-
|
300 |
-
##
|
301 |
-
|
302 |
review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
|
303 |
|
304 |
# Update dropdowns and review selection dataframe with the updated annotator object
|
@@ -313,19 +337,27 @@ def update_annotator_object_and_filter_df(
|
|
313 |
images_list[page_num_reported_zero_indexed] = replaced_image_path
|
314 |
|
315 |
all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
|
316 |
-
|
317 |
# Multiply out image_annotation coordinates from relative to absolute if necessary
|
318 |
all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
|
319 |
|
320 |
all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
321 |
|
|
|
|
|
322 |
all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
|
323 |
|
|
|
|
|
|
|
|
|
324 |
# Remove blank duplicate entries
|
325 |
all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
|
326 |
|
327 |
current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
|
328 |
|
|
|
|
|
329 |
page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
|
330 |
|
331 |
###
|
@@ -537,7 +569,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
537 |
page_sizes_df = pd.DataFrame(page_sizes)
|
538 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
539 |
|
540 |
-
for i in progress.tqdm(range(0, number_of_pages), desc="Saving
|
541 |
|
542 |
image_loc = all_image_annotations[i]['image']
|
543 |
|
@@ -561,7 +593,9 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
561 |
pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
|
562 |
else:
|
563 |
print("File type not recognised.")
|
564 |
-
|
|
|
|
|
565 |
#try:
|
566 |
if pdf_doc:
|
567 |
out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
|
@@ -579,7 +613,14 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
|
|
579 |
|
580 |
try:
|
581 |
#print("Saving review file.")
|
582 |
-
review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
584 |
|
585 |
review_df.to_csv(out_review_file_file_path, index=None)
|
@@ -752,8 +793,9 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
752 |
row_value_page = evt.row_value[0] # This is the page number value
|
753 |
row_value_label = evt.row_value[1] # This is the label number value
|
754 |
row_value_text = evt.row_value[2] # This is the text number value
|
|
|
755 |
|
756 |
-
row_value_df = pd.DataFrame(data={"page":[row_value_page], "label":[row_value_label], "text":[row_value_text]})
|
757 |
|
758 |
return row_value_page, row_value_df
|
759 |
|
@@ -787,25 +829,61 @@ def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
|
|
787 |
|
788 |
return row_value_page, row_value_df
|
789 |
|
790 |
-
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame,
|
791 |
'''
|
792 |
Update the colour of a single redaction box based on the values in a selection row
|
793 |
'''
|
794 |
colour_tuple = str(tuple(colour))
|
795 |
|
796 |
-
if "color" not in review_df.columns: review_df["color"] =
|
|
|
|
|
797 |
|
798 |
# Reset existing highlight colours
|
799 |
-
review_df.loc[review_df["
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
800 |
|
801 |
-
|
802 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
803 |
|
804 |
review_df.drop("_merge", axis=1, inplace=True)
|
805 |
|
806 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
807 |
|
808 |
-
return review_df
|
809 |
|
810 |
def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
|
811 |
"""
|
|
|
15 |
from PIL import ImageDraw, Image
|
16 |
|
17 |
from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
|
18 |
+
from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes
|
19 |
from tools.helper_functions import get_file_name_without_type, detect_file_type
|
20 |
from tools.file_redaction import redact_page_with_pymupdf
|
21 |
|
|
|
99 |
review_dataframe = review_df
|
100 |
|
101 |
try:
|
102 |
+
#print("converting annotation json in get_filtered_recogniser...")
|
103 |
+
|
104 |
review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
|
105 |
|
106 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
|
|
|
116 |
page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
|
117 |
page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
|
118 |
|
119 |
+
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text", "id"]], show_search="filter", col_count=(4, "fixed"), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
|
120 |
|
121 |
+
recogniser_dataframe_out = review_dataframe[["page", "label", "text", "id"]]
|
122 |
|
123 |
except Exception as e:
|
124 |
print("Could not extract recogniser information:", e)
|
125 |
+
recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text", "id"]]
|
126 |
|
127 |
label_choices = review_dataframe["label"].astype(str).unique().tolist()
|
128 |
text_choices = review_dataframe["text"].astype(str).unique().tolist()
|
|
|
153 |
|
154 |
review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
|
155 |
|
156 |
+
recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text", "id"]], show_search="filter", col_count=(4, "fixed"), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
|
157 |
|
158 |
recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
|
159 |
recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
|
|
|
181 |
'''
|
182 |
out_image_annotations_state = current_image_annotations_state
|
183 |
out_current_page_annotator = current_page_annotator
|
184 |
+
gradio_annotator_current_page_number = current_page
|
185 |
|
186 |
if not review_df.empty:
|
187 |
+
#print("review_df just before convert_review_df:", review_df)
|
188 |
+
# First, check that the image on the current page is valid, replace with what exists in page_sizes object if not
|
189 |
+
if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
|
190 |
+
|
191 |
+
# Check bounding values for current page and page max
|
192 |
+
if gradio_annotator_current_page_number > 0: page_num_reported = gradio_annotator_current_page_number
|
193 |
+
elif gradio_annotator_current_page_number == 0: page_num_reported = 1 # minimum possible reported page is 1
|
194 |
+
else:
|
195 |
+
gradio_annotator_current_page_number = 0
|
196 |
+
page_num_reported = 1
|
197 |
+
|
198 |
+
# Ensure page displayed can't exceed number of pages in document
|
199 |
+
page_max_reported = len(out_image_annotations_state)
|
200 |
+
if page_num_reported > page_max_reported: page_num_reported = page_max_reported
|
201 |
+
|
202 |
+
page_num_reported_zero_indexed = page_num_reported - 1
|
203 |
out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
|
204 |
|
205 |
+
page_image_annotator_object, out_image_annotations_state = replace_images_in_image_annotation_object(out_image_annotations_state, out_image_annotations_state[page_num_reported_zero_indexed], page_sizes, page_num_reported)
|
206 |
|
207 |
+
out_image_annotations_state[page_num_reported_zero_indexed] = page_image_annotator_object
|
208 |
+
|
209 |
+
out_current_page_annotator = out_image_annotations_state[page_num_reported_zero_indexed]
|
210 |
|
211 |
return out_current_page_annotator, out_image_annotations_state
|
212 |
|
|
|
225 |
backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
226 |
|
227 |
if not selected_rows_df.empty and not review_df.empty:
|
228 |
+
use_id = (
|
229 |
+
"id" in selected_rows_df.columns
|
230 |
+
and "id" in review_df.columns
|
231 |
+
and not selected_rows_df["id"].isnull().all()
|
232 |
+
and not review_df["id"].isnull().all()
|
233 |
+
)
|
234 |
|
235 |
+
selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
|
236 |
+
|
237 |
+
# Subset and drop duplicates from selected_rows_df
|
238 |
+
selected_subset = selected_rows_df[selected_merge_cols].drop_duplicates(subset=selected_merge_cols)
|
239 |
+
|
240 |
+
# Perform anti-join using merge with indicator
|
241 |
+
merged_df = review_df.merge(selected_subset, on=selected_merge_cols, how='left', indicator=True)
|
242 |
out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
|
243 |
|
244 |
out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
|
245 |
|
246 |
+
out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text", "id"]]
|
247 |
|
248 |
# Either there is nothing left in the selection dataframe, or the review dataframe
|
249 |
else:
|
250 |
out_review_df = review_df
|
251 |
out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
|
|
|
252 |
out_image_annotations_state = image_annotations_state
|
253 |
|
254 |
return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
|
|
|
259 |
recogniser_entities_dropdown_value:str="ALL",
|
260 |
page_dropdown_value:str="ALL",
|
261 |
text_dropdown_value:str="ALL",
|
262 |
+
recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400, static_columns=[0,1,2,3]),
|
263 |
zoom:int=100,
|
264 |
review_df:pd.DataFrame=[],
|
265 |
page_sizes:List[dict]=[],
|
|
|
269 |
Update a gradio_image_annotation object with new annotation data.
|
270 |
'''
|
271 |
zoom_str = str(zoom) + '%'
|
272 |
+
|
273 |
+
#print("all_image_annotations at start of update_annotator_object_and_filter_df[-1]:", all_image_annotations[-1])
|
274 |
|
275 |
if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
|
276 |
|
|
|
322 |
|
323 |
replaced_image_path = current_image_path
|
324 |
|
325 |
+
if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text", "id"])
|
|
|
|
|
|
|
326 |
review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
|
327 |
|
328 |
# Update dropdowns and review selection dataframe with the updated annotator object
|
|
|
337 |
images_list[page_num_reported_zero_indexed] = replaced_image_path
|
338 |
|
339 |
all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
|
340 |
+
|
341 |
# Multiply out image_annotation coordinates from relative to absolute if necessary
|
342 |
all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
|
343 |
|
344 |
all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
345 |
|
346 |
+
#print("all_image_annotations_df[-1] just before creating annotation dicts:", all_image_annotations_df.iloc[-1, :])
|
347 |
+
|
348 |
all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
|
349 |
|
350 |
+
#print("all_image_annotations[-1] after creating annotation dicts:", all_image_annotations[-1])
|
351 |
+
|
352 |
+
|
353 |
+
|
354 |
# Remove blank duplicate entries
|
355 |
all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
|
356 |
|
357 |
current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
|
358 |
|
359 |
+
#print("current_page_image_annotator_object that goes into annotator object:", current_page_image_annotator_object)
|
360 |
+
|
361 |
page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
|
362 |
|
363 |
###
|
|
|
569 |
page_sizes_df = pd.DataFrame(page_sizes)
|
570 |
page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
|
571 |
|
572 |
+
for i in progress.tqdm(range(0, number_of_pages), desc="Saving redacted pages to file", unit = "pages"):
|
573 |
|
574 |
image_loc = all_image_annotations[i]['image']
|
575 |
|
|
|
593 |
pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
|
594 |
else:
|
595 |
print("File type not recognised.")
|
596 |
+
|
597 |
+
progress(0.9, "Saving output files")
|
598 |
+
|
599 |
#try:
|
600 |
if pdf_doc:
|
601 |
out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
|
|
|
613 |
|
614 |
try:
|
615 |
#print("Saving review file.")
|
616 |
+
review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)
|
617 |
+
|
618 |
+
page_sizes_df = pd.DataFrame(page_sizes)
|
619 |
+
page_sizes_df .loc[:, "page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
|
620 |
+
review_df = divide_coordinates_by_page_sizes(review_df, page_sizes_df)
|
621 |
+
|
622 |
+
review_df = review_df[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text", "id"]]
|
623 |
+
|
624 |
out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
|
625 |
|
626 |
review_df.to_csv(out_review_file_file_path, index=None)
|
|
|
793 |
row_value_page = evt.row_value[0] # This is the page number value
|
794 |
row_value_label = evt.row_value[1] # This is the label number value
|
795 |
row_value_text = evt.row_value[2] # This is the text number value
|
796 |
+
row_value_id = evt.row_value[3] # This is the text number value
|
797 |
|
798 |
+
row_value_df = pd.DataFrame(data={"page":[row_value_page], "label":[row_value_label], "text":[row_value_text], "id":[row_value_id]})
|
799 |
|
800 |
return row_value_page, row_value_df
|
801 |
|
|
|
829 |
|
830 |
return row_value_page, row_value_df
|
831 |
|
832 |
+
def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, previous_id:str="", previous_colour:str='(0, 0, 0)', page_sizes:List[dict]=[], output_folder:str=OUTPUT_FOLDER, colour:str='(1, 0, 255)'):
|
833 |
'''
|
834 |
Update the colour of a single redaction box based on the values in a selection row
|
835 |
'''
|
836 |
colour_tuple = str(tuple(colour))
|
837 |
|
838 |
+
if "color" not in review_df.columns: review_df["color"] = '(0, 0, 0)'
|
839 |
+
if "id" not in review_df.columns:
|
840 |
+
review_df = fill_missing_ids(review_df)
|
841 |
|
842 |
# Reset existing highlight colours
|
843 |
+
review_df.loc[review_df["id"]==previous_id, "color"] = review_df.loc[review_df["id"]==previous_id, "color"].apply(lambda _: previous_colour)
|
844 |
+
review_df.loc[review_df["color"].astype(str)==colour, "color"] = review_df.loc[review_df["color"].astype(str)==colour, "color"].apply(lambda _: '(0, 0, 0)')
|
845 |
+
|
846 |
+
if not redaction_row_selection.empty and not review_df.empty:
|
847 |
+
use_id = (
|
848 |
+
"id" in redaction_row_selection.columns
|
849 |
+
and "id" in review_df.columns
|
850 |
+
and not redaction_row_selection["id"].isnull().all()
|
851 |
+
and not review_df["id"].isnull().all()
|
852 |
+
)
|
853 |
+
|
854 |
+
selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
|
855 |
+
|
856 |
+
review_df = review_df.merge(redaction_row_selection[selected_merge_cols], on=selected_merge_cols, indicator=True, how="left")
|
857 |
|
858 |
+
if "_merge" in review_df.columns:
|
859 |
+
filtered_reviews = review_df.loc[review_df["_merge"]=="both"]
|
860 |
+
else:
|
861 |
+
filtered_reviews = pd.DataFrame()
|
862 |
+
|
863 |
+
if not filtered_reviews.empty:
|
864 |
+
previous_colour = str(filtered_reviews["color"].values[0])
|
865 |
+
previous_id = filtered_reviews["id"].values[0]
|
866 |
+
review_df.loc[review_df["_merge"]=="both", "color"] = review_df.loc[review_df["_merge"] == "both", "color"].apply(lambda _: colour)
|
867 |
+
else:
|
868 |
+
# Handle the case where no rows match the condition
|
869 |
+
print("No reviews found with _merge == 'both'")
|
870 |
+
previous_colour = '(0, 0, 0)'
|
871 |
+
review_df.loc[review_df["color"]==colour, "color"] = previous_colour
|
872 |
+
previous_id =''
|
873 |
|
874 |
review_df.drop("_merge", axis=1, inplace=True)
|
875 |
|
876 |
+
# Ensure that all output coordinates are in proportional size
|
877 |
+
#page_sizes_df = pd.DataFrame(page_sizes)
|
878 |
+
#page_sizes_df .loc[:, "page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
|
879 |
+
#print("review_df before divide:", review_df)
|
880 |
+
#print("page_sizes_df before divide:", page_sizes_df)
|
881 |
+
#review_df = divide_coordinates_by_page_sizes(review_df, page_sizes_df)
|
882 |
+
#print("review_df after divide:", review_df)
|
883 |
+
|
884 |
+
review_df = review_df[["image", "page", "label", "color", "xmin","ymin", "xmax", "ymax", "text", "id"]]
|
885 |
|
886 |
+
return review_df, previous_id, previous_colour
|
887 |
|
888 |
def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
|
889 |
"""
|
tools/textract_batch_call.py
CHANGED
@@ -164,7 +164,7 @@ def analyse_document_with_textract_api(
|
|
164 |
}])
|
165 |
|
166 |
# File path
|
167 |
-
log_file_path = os.path.join(local_output_dir, "
|
168 |
|
169 |
# Check if file exists
|
170 |
file_exists = os.path.exists(log_file_path)
|
@@ -444,18 +444,16 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
|
|
444 |
'''
|
445 |
Load in a dataframe of jobs previous submitted to the Textract API service.
|
446 |
'''
|
447 |
-
|
448 |
job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
|
449 |
|
450 |
# Initialize boto3 clients
|
451 |
session = boto3.Session(region_name=aws_region)
|
452 |
s3_client = session.client('s3')
|
453 |
|
454 |
-
local_output_path = f'{load_local_jobs_loc}/
|
455 |
|
456 |
if load_s3_jobs == 'True':
|
457 |
-
|
458 |
-
s3_output_key = f'{load_s3_jobs_loc}/textract_job_log_files.csv'
|
459 |
|
460 |
try:
|
461 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
@@ -523,4 +521,10 @@ def download_textract_output(job_id:str,
|
|
523 |
s3_client.download_file(output_bucket, output_file_key, local_file_path)
|
524 |
print(f"Output file downloaded to: {local_file_path}")
|
525 |
except Exception as e:
|
526 |
-
print(f"Error downloading file: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
}])
|
165 |
|
166 |
# File path
|
167 |
+
log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
|
168 |
|
169 |
# Check if file exists
|
170 |
file_exists = os.path.exists(log_file_path)
|
|
|
444 |
'''
|
445 |
Load in a dataframe of jobs previous submitted to the Textract API service.
|
446 |
'''
|
|
|
447 |
job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
|
448 |
|
449 |
# Initialize boto3 clients
|
450 |
session = boto3.Session(region_name=aws_region)
|
451 |
s3_client = session.client('s3')
|
452 |
|
453 |
+
local_output_path = f'{load_local_jobs_loc}/textract_document_jobs.csv'
|
454 |
|
455 |
if load_s3_jobs == 'True':
|
456 |
+
s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
|
|
|
457 |
|
458 |
try:
|
459 |
s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
|
|
|
521 |
s3_client.download_file(output_bucket, output_file_key, local_file_path)
|
522 |
print(f"Output file downloaded to: {local_file_path}")
|
523 |
except Exception as e:
|
524 |
+
print(f"Error downloading file: {e}")
|
525 |
+
|
526 |
+
def check_textract_outputs_exist(textract_output_found_checkbox):
|
527 |
+
if textract_output_found_checkbox == True:
|
528 |
+
print("Textract outputs found")
|
529 |
+
return
|
530 |
+
else: raise Exception("Relevant Tetract outputs not found. Please ensure you have selected to correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")
|