Sean Pedrick-Case committed on
Commit 4f93b3f · unverified · 2 parents: c27db98 a56b9b0

Merge pull request #16 from seanpedrick-case/dev


Added id and text properties to annotation object. Other minor changes.
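For context, a minimal sketch of the annotation object shape this commit extends (values below are hypothetical, not taken from the repo): each box in a page's "boxes" list now carries a "text" property and a 12-character alphanumeric "id" alongside the existing coordinates, label and colour.

# Hypothetical example of one page's annotation object after this change
example_annotation = {
    "image": "input/example_0.png",  # page image path (hypothetical)
    "boxes": [
        {
            "label": "EMAIL_ADDRESS",
            "color": (0, 0, 255),
            "xmin": 0.12, "ymin": 0.30, "xmax": 0.45, "ymax": 0.33,  # relative coordinates
            "text": "jane.doe@example.com",  # new: the text the box covers
            "id": "a1B2c3D4e5F6",            # new: unique 12-character id
        }
    ],
}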

app.py CHANGED
@@ -15,7 +15,7 @@ from tools.auth import authenticate_user
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
- from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
@@ -153,6 +153,8 @@ with app:
153
  s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
154
  s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
155
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
 
 
156
 
157
  load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
158
  s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
@@ -164,7 +166,7 @@ with app:
164
  default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
165
 
166
  # Base tables that are not modified subsequent to load
167
- recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
168
  all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
169
  all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
170
  cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
@@ -203,6 +205,7 @@ with app:
203
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
204
 
205
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
 
206
 
207
  ###
208
  # UI DESIGN
@@ -263,8 +266,10 @@ with app:
263
  job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
264
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
265
  with gr.Row():
266
- job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
267
- textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
 
 
268
 
269
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
270
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
@@ -298,8 +303,8 @@ with app:
298
  with gr.Column(scale=2):
299
  with gr.Row(equal_height=True):
300
  annotation_last_page_button = gr.Button("Previous page", scale = 4)
301
- annotate_current_page = gr.Number(value=0, label="Current page", precision=0, scale = 2, min_width=50)
302
- annotate_max_pages = gr.Number(value=0, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
303
  annotation_next_page_button = gr.Button("Next page", scale = 4)
304
 
305
  zoom_str = str(annotator_zoom_number) + '%'
@@ -336,7 +341,7 @@ with app:
336
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
337
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
338
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
339
- recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=(3,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
340
 
341
  with gr.Row(equal_height=True):
342
  exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
@@ -346,7 +351,9 @@ with app:
346
 
347
  undo_last_removal_btn = gr.Button(value="Undo last element removal")
348
 
349
- selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), col_count=3, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True)
 
 
350
 
351
  with gr.Accordion("Search all extracted text", open=True):
352
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
@@ -520,6 +527,13 @@ with app:
520
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
521
 
522
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
523
 
524
  ###
525
  # REVIEW PDF REDACTIONS
@@ -546,17 +560,22 @@ with app:
546
 
547
  # Apply page redactions
548
  annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
549
 
550
  # Review table controls
551
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
552
  page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
553
  text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
554
 
555
- recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page, selected_entity_dataframe_row])#.\
556
- #success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state], outputs=[review_file_state]).\
557
- #success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, annotate_current_page, annotate_previous_page, all_image_annotations_state, annotator], outputs=[annotator, all_image_annotations_state])
558
-
559
-
560
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
561
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
562
 
@@ -577,9 +596,7 @@ with app:
577
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
578
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
579
 
580
- update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
581
- success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
582
- success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
583
 
584
# Review OCR text button
585
  all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
@@ -717,7 +734,7 @@ if __name__ == "__main__":
717
  if RUN_DIRECT_MODE == "0":
718
 
719
  if os.environ['COGNITO_AUTH'] == "1":
720
- app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
721
  else:
722
  app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
723
 
 
15
  from tools.load_spacy_model_custom_recognisers import custom_entities
16
  from tools.custom_csvlogger import CSVLogger_custom
17
  from tools.find_duplicate_pages import identify_similar_pages
18
+ from tools.textract_batch_call import analyse_document_with_textract_api, poll_bulk_textract_analysis_progress_and_download, load_in_textract_job_details, check_for_provided_job_id, check_textract_outputs_exist
19
 
20
  # Suppress downcasting warnings
21
  pd.set_option('future.no_silent_downcasting', True)
 
153
  s3_bulk_textract_input_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_BULK_ANALYSIS_INPUT_SUBFOLDER, visible=False)
154
  s3_bulk_textract_output_subfolder = gr.Textbox(label = "Default Textract bulk S3 output folder", value=TEXTRACT_BULK_ANALYSIS_OUTPUT_SUBFOLDER, visible=False)
155
  successful_textract_api_call_number = gr.Number(precision=0, value=0, visible=False)
156
+ no_redaction_method_drop = gr.Radio(label = """Placeholder for no redaction method after downloading Textract outputs""", value = no_redaction_option, choices=[no_redaction_option], visible=False)
157
+ textract_only_method_drop = gr.Radio(label="""Placeholder for Textract method after downloading Textract outputs""", value = textract_option, choices=[textract_option], visible=False)
158
 
159
  load_s3_bulk_textract_logs_bool = gr.Textbox(label = "Load Textract logs or not", value=LOAD_PREVIOUS_TEXTRACT_JOBS_S3, visible=False)
160
  s3_bulk_textract_logs_subfolder = gr.Textbox(label = "Default Textract bulk S3 input folder", value=TEXTRACT_JOBS_S3_LOC, visible=False)
 
166
  default_cost_code_textbox = gr.Textbox(label = "Default cost code textbox", value=DEFAULT_COST_CODE, visible=False)
167
 
168
  # Base tables that are not modified subsequent to load
169
+ recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="recogniser_entity_dataframe_base", show_search="filter", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, static_columns=[0,1,2,3])
170
  all_line_level_ocr_results_df_base = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, visible=False)
171
  all_line_level_ocr_results_df_placeholder = gr.Dataframe(visible=False)
172
  cost_code_dataframe_base = gr.Dataframe(value=pd.DataFrame(), row_count = (0, "dynamic"), label="Cost codes", type="pandas", interactive=True, show_fullscreen_button=True, show_copy_button=True, show_search='filter', wrap=True, max_height=200, visible=False)
 
205
  job_output_textbox = gr.Textbox(value="", label="Textract call outputs", visible=False)
206
 
207
  textract_job_output_file = gr.File(label="Textract job output files", height=file_input_height, visible=False)
208
+ convert_textract_outputs_to_ocr_results = gr.Button("Placeholder - Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=False)
209
 
210
  ###
211
  # UI DESIGN
 
266
  job_id_textbox = gr.Textbox(label = "Job ID to check status", value='', visible=True)
267
  check_state_of_textract_api_call_btn = gr.Button("Check status of Textract job and download", variant="secondary", visible=True)
268
  with gr.Row():
269
+ job_current_status = gr.Textbox(value="", label="Analysis job current status", visible=True)
270
+ textract_job_output_file = gr.File(label="Textract job output files", height=100, visible=True)
271
+
272
+ convert_textract_outputs_to_ocr_results = gr.Button("Convert Textract job outputs to OCR results (needs relevant document file uploaded above)", variant="secondary", visible=True)
273
 
274
  gr.Markdown("""If you only want to redact certain pages, or certain entities (e.g. just email addresses, or a custom list of terms), please go to the Redaction Settings tab.""")
275
  document_redact_btn = gr.Button("Extract text and redact document", variant="primary", scale = 4)
 
303
  with gr.Column(scale=2):
304
  with gr.Row(equal_height=True):
305
  annotation_last_page_button = gr.Button("Previous page", scale = 4)
306
+ annotate_current_page = gr.Number(value=1, label="Current page", precision=0, scale = 2, min_width=50)
307
+ annotate_max_pages = gr.Number(value=1, label="Total pages", precision=0, interactive=False, scale = 2, min_width=50)
308
  annotation_next_page_button = gr.Button("Next page", scale = 4)
309
 
310
  zoom_str = str(annotator_zoom_number) + '%'
 
341
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
342
  page_entity_dropdown = gr.Dropdown(label="Page", value="ALL", allow_custom_value=True)
343
  text_entity_dropdown = gr.Dropdown(label="Text", value="ALL", allow_custom_value=True)
344
+ recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=(4,"fixed"), type="pandas", label="Search results. Click to go to page", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
345
 
346
  with gr.Row(equal_height=True):
347
  exclude_selected_row_btn = gr.Button(value="Exclude specific row from redactions")
 
351
 
352
  undo_last_removal_btn = gr.Button(value="Undo last element removal")
353
 
354
+ selected_entity_dataframe_row = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), col_count=4, type="pandas", visible=False, label="selected_entity_dataframe_row", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True)
355
+ selected_entity_id = gr.Textbox(value="", label="selected_entity_id", visible=False)
356
+ selected_entity_colour = gr.Textbox(value="", label="selected_entity_colour", visible=False)
357
 
358
  with gr.Accordion("Search all extracted text", open=True):
359
  all_line_level_ocr_results_df = gr.Dataframe(value=pd.DataFrame(), headers=["page", "text"], col_count=(2, 'fixed'), row_count = (0, "dynamic"), label="All OCR results", visible=True, type="pandas", wrap=True, show_fullscreen_button=True, show_search='filter', show_label=False, show_copy_button=True, max_height=400)
 
527
  success(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox])
528
 
529
  textract_job_detail_df.select(df_select_callback_textract_api, inputs=[textract_output_found_checkbox], outputs=[job_id_textbox, job_type_dropdown, selected_job_id_row])
530
+
531
+
532
+ convert_textract_outputs_to_ocr_results.click(fn=check_for_existing_textract_file, inputs=[doc_file_name_no_extension_textbox, output_folder_textbox], outputs=[textract_output_found_checkbox]).\
533
+ success(fn= check_textract_outputs_exist, inputs=[textract_output_found_checkbox]).\
534
+ success(fn = reset_state_vars, outputs=[all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dataframe, recogniser_entity_dataframe_base, pdf_doc_state, duplication_file_path_outputs_list_state, redaction_output_summary_textbox, is_a_textract_api_call]).\
535
+ success(fn= choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, textract_only_method_drop, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, redaction_output_summary_textbox, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, actual_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_base, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, no_redaction_method_drop, comprehend_query_number, max_fuzzy_spelling_mistakes_num, match_fuzzy_whole_phrase_bool, aws_access_key_textbox, aws_secret_key_textbox, annotate_max_pages, review_file_state, output_folder_textbox, document_cropboxes, page_sizes, textract_output_found_checkbox, only_extract_text_radio, duplication_file_path_outputs_list_state, latest_review_file_path, input_folder_textbox, textract_query_number, latest_ocr_file_path],
536
+ outputs=[redaction_output_summary_textbox, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, actual_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_base, all_decision_process_table_state, comprehend_query_number, output_review_files, annotate_max_pages, annotate_max_pages_bottom, prepared_pdf_state, images_pdf_state, review_file_state, page_sizes, duplication_file_path_outputs_list_state, in_duplicate_pages, latest_review_file_path, textract_query_number, latest_ocr_file_path])
537
 
538
  ###
539
  # REVIEW PDF REDACTIONS
 
560
 
561
  # Apply page redactions
562
  annotation_button_apply.click(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state], scroll_to_output=True)
563
+
564
+ # Save current page redactions
565
+ update_current_page_redactions_btn.click(update_all_page_annotation_object_based_on_previous_page, inputs = [annotator, annotate_current_page, annotate_current_page, all_image_annotations_state, page_sizes], outputs = [all_image_annotations_state, annotate_previous_page, annotate_current_page_bottom]).\
566
+ success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
567
+ success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
568
 
569
  # Review table controls
570
  recogniser_entity_dropdown.select(update_entities_df_recogniser_entities, inputs=[recogniser_entity_dropdown, recogniser_entity_dataframe_base, page_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, text_entity_dropdown, page_entity_dropdown])
571
  page_entity_dropdown.select(update_entities_df_page, inputs=[page_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, text_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, text_entity_dropdown])
572
  text_entity_dropdown.select(update_entities_df_text, inputs=[text_entity_dropdown, recogniser_entity_dataframe_base, recogniser_entity_dropdown, page_entity_dropdown], outputs=[recogniser_entity_dataframe, recogniser_entity_dropdown, page_entity_dropdown])
573
 
574
+ # Clicking on a cell in the recogniser entity dataframe will take you to that page, and also highlight the target redaction box in blue
575
+ recogniser_entity_dataframe.select(df_select_callback, inputs=[recogniser_entity_dataframe], outputs=[annotate_current_page, selected_entity_dataframe_row]).\
576
+ success(update_selected_review_df_row_colour, inputs=[selected_entity_dataframe_row, review_file_state, selected_entity_id, selected_entity_colour, page_sizes], outputs=[review_file_state, selected_entity_id, selected_entity_colour]).\
577
+ success(update_annotator_page_from_review_df, inputs=[review_file_state, images_pdf_state, page_sizes, annotate_current_page, annotate_previous_page, all_image_annotations_state, annotator], outputs=[annotator, all_image_annotations_state])
578
+
579
  reset_dropdowns_btn.click(reset_dropdowns, inputs=[recogniser_entity_dataframe_base], outputs=[recogniser_entity_dropdown, text_entity_dropdown, page_entity_dropdown]).\
580
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state])
581
 
 
596
  success(update_annotator_object_and_filter_df, inputs=[all_image_annotations_state, annotate_current_page, recogniser_entity_dropdown, page_entity_dropdown, text_entity_dropdown, recogniser_entity_dataframe_base, annotator_zoom_number, review_file_state, page_sizes, doc_full_file_name_textbox, input_folder_textbox], outputs = [annotator, annotate_current_page, annotate_current_page_bottom, annotate_previous_page, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base, text_entity_dropdown, page_entity_dropdown, page_sizes, all_image_annotations_state]).\
597
  success(apply_redactions_to_review_df_and_files, inputs=[annotator, doc_full_file_name_textbox, pdf_doc_state, all_image_annotations_state, annotate_current_page, review_file_state, output_folder_textbox, do_not_save_pdf_state, page_sizes], outputs=[pdf_doc_state, all_image_annotations_state, output_review_files, log_files_output, review_file_state])
598
 
599
+
 
 
600
 
601
# Review OCR text button
602
  all_line_level_ocr_results_df.select(df_select_callback_ocr, inputs=[all_line_level_ocr_results_df], outputs=[annotate_current_page, selected_entity_dataframe_row], scroll_to_output=True)
 
734
  if RUN_DIRECT_MODE == "0":
735
 
736
  if os.environ['COGNITO_AUTH'] == "1":
737
+ app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, auth=authenticate_user, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
738
  else:
739
  app.queue(max_size=int(MAX_QUEUE_SIZE), default_concurrency_limit=int(DEFAULT_CONCURRENCY_LIMIT)).launch(show_error=True, inbrowser=True, max_file_size=MAX_FILE_SIZE, server_port=GRADIO_SERVER_PORT, root_path=ROOT_PATH)
740
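The convert_textract_outputs_to_ocr_results wiring above uses Gradio's chained-event pattern: each .success() handler runs only if the preceding step completed without raising, so a failed output check halts the whole conversion chain. A minimal self-contained sketch of that pattern (component and function names here are illustrative, not from app.py):

import gradio as gr

def check_outputs_exist(found: bool) -> bool:
    # Raising gr.Error aborts the chain, so later .success() steps never run
    if not found:
        raise gr.Error("No Textract output file found for this document.")
    return found

def run_conversion(found: bool) -> str:
    return "OCR results ready"

with gr.Blocks() as demo:
    textract_output_found = gr.Checkbox(value=True, visible=False)
    status = gr.Textbox(label="Status")
    convert_btn = gr.Button("Convert Textract job outputs to OCR results")
    convert_btn.click(check_outputs_exist, inputs=[textract_output_found], outputs=[textract_output_found]).\
        success(run_conversion, inputs=[textract_output_found], outputs=[status])

demo.launch()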
 
requirements.txt CHANGED
@@ -10,7 +10,6 @@ pandas==2.2.3
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- #en_core_web_sm @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
14
  gradio==5.25.2
15
  boto3==1.37.29
16
  pyarrow==19.0.1
@@ -19,7 +18,7 @@ Faker==36.1.1
19
  python-levenshtein==0.26.1
20
  spaczz==0.6.1
21
  # The following version
22
- https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.1/gradio_image_annotation-0.3.1-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
23
  rapidfuzz==3.12.1
24
  python-dotenv==1.0.1
25
  numpy==1.26.4
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
 
13
  gradio==5.25.2
14
  boto3==1.37.29
15
  pyarrow==19.0.1
 
18
  python-levenshtein==0.26.1
19
  spaczz==0.6.1
20
  # The following version
21
+ https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.2/gradio_image_annotation-0.3.2-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
22
  rapidfuzz==3.12.1
23
  python-dotenv==1.0.1
24
  numpy==1.26.4
tools/auth.py CHANGED
@@ -69,5 +69,7 @@ def authenticate_user(username:str, password:str, user_pool_id:str=AWS_USER_POOL
69
  except client.exceptions.UserNotFoundException:
70
  return False
71
  except Exception as e:
72
- print(f"An error occurred: {e}")
 
 
73
  return False
 
69
  except client.exceptions.UserNotFoundException:
70
  return False
71
  except Exception as e:
72
+ out_message = f"An error occurred: {e}"
73
+ print(out_message)
74
+ raise Exception(out_message)
75
  return False
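With this change, unexpected failures in authenticate_user are raised instead of being swallowed into a False return, so login problems caused by misconfiguration surface as errors rather than looking like bad credentials. A hypothetical caller-side illustration:

# Hypothetical usage: UserNotFoundException still yields False (bad credentials),
# but infrastructure errors (e.g. a misconfigured user pool) now propagate.
try:
    authenticated = authenticate_user("jane", "correct-horse-battery-staple")
except Exception as err:
    print(f"Login aborted by an unexpected error: {err}")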
tools/aws_functions.py CHANGED
@@ -42,10 +42,6 @@ def download_file_from_s3(bucket_name:str, key:str, local_file_path_and_name:str
42
  if RUN_AWS_FUNCTIONS == "1":
43
 
44
  try:
45
- print("bucket_name:", bucket_name)
46
- print("key:", key)
47
- print("local_file_path_and_name:", local_file_path_and_name)
48
-
49
  # Ensure the local directory exists
50
  os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
51
 
 
42
  if RUN_AWS_FUNCTIONS == "1":
43
 
44
  try:
45
  # Ensure the local directory exists
46
  os.makedirs(os.path.dirname(local_file_path_and_name), exist_ok=True)
47
 
tools/file_conversion.py CHANGED
@@ -19,6 +19,8 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
19
  from pdf2image import convert_from_path
20
  from PIL import Image
21
  from scipy.spatial import cKDTree
 
 
22
 
23
  IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
24
 
@@ -834,10 +836,10 @@ def remove_duplicate_images_with_blank_boxes(data: List[dict]) -> List[dict]:
834
  # Filter items with non-empty boxes
835
  non_empty_boxes = [item for item in items if item.get('boxes')]
836
 
837
- # Remove 'text' elements from boxes
838
- for item in non_empty_boxes:
839
- if 'boxes' in item:
840
- item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
841
 
842
  if non_empty_boxes:
843
  # Keep the first entry with non-empty boxes
@@ -855,13 +857,19 @@ def divide_coordinates_by_page_sizes(review_file_df:pd.DataFrame, page_sizes_df:
855
  review_file_df_out = review_file_df
856
 
857
  if xmin in review_file_df.columns and not review_file_df.empty:
858
  review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]
 
 
859
 
860
- review_file_df = review_file_df.loc[(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) & (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1),:]
861
 
862
- review_file_df.loc[:, "page"] = pd.to_numeric(review_file_df["page"], errors="coerce")
863
 
864
- review_file_df_div = review_file_df
865
 
866
  if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
867
 
@@ -902,6 +910,11 @@ def multiply_coordinates_by_page_sizes(review_file_df: pd.DataFrame, page_sizes_
902
 
903
 
904
  if xmin in review_file_df.columns and not review_file_df.empty:
  # Separate absolute vs relative coordinates
906
  review_file_df_orig = review_file_df.loc[
907
  (review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
@@ -1014,6 +1027,12 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
1014
  if not 'text' in df2.columns: df2['text'] = ''
1015
  if not 'text' in df1.columns: df1['text'] = ''
1016
1017
  # Create a unique key based on coordinates and label for exact merge
1018
  merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
1019
  df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
@@ -1031,6 +1050,8 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
1031
 
1032
  # Handle missing matches using a proximity-based approach
1033
  # Convert coordinates to numpy arrays for KDTree lookup
 
 
1034
  query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)
1035
 
1036
  # Check for NaN or infinite values in query_coords and filter them out
@@ -1064,9 +1085,6 @@ def do_proximity_match_all_pages_for_text(df1:pd.DataFrame, df2:pd.DataFrame, th
1064
 
1065
  return merged_df
1066
 
1067
-
1068
-
1069
-
1070
  def _extract_page_number(image_path: Any) -> int:
1071
  """Helper function to safely extract page number."""
1072
  if not isinstance(image_path, str):
@@ -1085,7 +1103,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1085
  '''
1086
  if not all_annotations:
1087
  # Return an empty DataFrame with the expected schema if input is empty
1088
- return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text"])
1089
 
1090
  # 1. Create initial DataFrame from the list of annotations
1091
  # Use list comprehensions with .get() for robustness
@@ -1102,7 +1120,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1102
  # Explode removes rows where the list is empty. We want to keep them
1103
  # as rows with NA values. Replace empty lists with a list containing
1104
  # a single placeholder dictionary.
1105
- placeholder_box = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA}
1106
  df['boxes'] = df['boxes'].apply(lambda x: x if x else [placeholder_box])
1107
 
1108
  # 4. Explode the 'boxes' column. Each item in the list becomes a new row.
@@ -1124,7 +1142,7 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1124
  # prevents this from being necessary.
1125
 
1126
  # 7. Ensure essential columns exist and set column order
1127
- essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text"]
1128
  for col in essential_box_cols:
1129
  if col not in final_df.columns:
1130
  final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
@@ -1140,71 +1158,6 @@ def convert_annotation_data_to_dataframe(all_annotations: List[Dict[str, Any]]):
1140
 
1141
  return final_df
1142
 
1143
-
1144
- # def convert_annotation_data_to_dataframe(all_annotations:List[dict]):
1145
- # '''
1146
- # Convert an annotation list of dictionaries to a dataframe with all boxes on a separate row
1147
- # '''
1148
- # # Flatten the data
1149
- # flattened_annotation_data = []
1150
-
1151
- # for annotation in all_annotations:
1152
- # image_path = annotation["image"]
1153
-
1154
- # if image_path:
1155
- # match = re.search(r'_(\d+)\.png$', image_path)
1156
- # if match:
1157
- # number = match.group(1)
1158
- # reported_number = int(number) + 1
1159
- # else:
1160
- # reported_number = 1
1161
- # else:
1162
- # reported_number = 1
1163
-
1164
- # # Check if 'boxes' is in the annotation, if not, add an empty list
1165
- # if 'boxes' not in annotation:
1166
- # annotation['boxes'] = []
1167
-
1168
- # # If boxes are empty, create a row with blank values for xmin, xmax, ymin, ymax
1169
- # if not annotation["boxes"]:
1170
- # data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA}
1171
- # flattened_annotation_data.append(data_to_add)
1172
- # else:
1173
- # for box in annotation["boxes"]:
1174
- # if 'xmin' not in box:
1175
- # data_to_add = {"image": image_path, "page": reported_number, "xmin": pd.NA, 'xmax': pd.NA, 'ymin': pd.NA, 'ymax': pd.NA}
1176
- # elif 'text' not in box:
1177
- # data_to_add = {"image": image_path, "page": reported_number, **box}
1178
- # else:
1179
- # data_to_add = {"image": image_path, "page": reported_number, "text": box['text'], **box}
1180
- # flattened_annotation_data.append(data_to_add)
1181
-
1182
- # # Convert to a DataFrame
1183
- # review_file_df = pd.DataFrame(flattened_annotation_data)
1184
-
1185
- # return review_file_df
1186
-
1187
- # def create_annotation_dicts_from_annotation_df(all_image_annotations_df:pd.DataFrame, page_sizes:List[dict]):
1188
- # '''
1189
- # From an annotation object as a dataframe, convert back to a list of dictionaries that can be used in the Gradio Image Annotator component
1190
- # '''
1191
- # result = []
1192
-
1193
- # # Ensure that every page has an entry in the resulting list of dicts
1194
- # for image_path in page_sizes:
1195
- # annotation = {}
1196
- # annotation["image"] = image_path["image_path"]
1197
- # annotation["boxes"] = []
1198
-
1199
- # result.append(annotation)
1200
-
1201
- # # Then add in all the filled in data
1202
- # for image, group in all_image_annotations_df.groupby('image'):
1203
- # boxes = group[['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']].to_dict(orient='records')
1204
- # result.append({'image': image, 'boxes': boxes})
1205
-
1206
- # return result
1207
-
1208
  def create_annotation_dicts_from_annotation_df(
1209
  all_image_annotations_df: pd.DataFrame,
1210
  page_sizes: List[Dict[str, Any]]
@@ -1228,9 +1181,12 @@ def create_annotation_dicts_from_annotation_df(
1228
 
1229
  # 2. Define columns to extract for boxes and check availability
1230
  # Make sure these columns actually exist in the DataFrame
1231
- box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
1232
  available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
1233
 
 
 
 
1234
  if not available_cols:
1235
  print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
1236
  return list(image_dict.values()) # Return based on page_sizes only
@@ -1248,7 +1204,6 @@ def create_annotation_dicts_from_annotation_df(
1248
  print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
1249
  return list(image_dict.values())
1250
 
1251
-
1252
  # Process groups
1253
  try:
1254
  for image_path, group in valid_box_df.groupby('image', observed=True, sort=False):
@@ -1271,122 +1226,353 @@ def create_annotation_dicts_from_annotation_df(
1271
 
1272
  return result
1273
 
1274
- # import pandas as pd
1275
- # from typing import List, Dict, Any
1276
-
1277
- # def create_annotation_dicts_from_annotation_df(
1278
- # all_image_annotations_df: pd.DataFrame,
1279
- # page_sizes: List[Dict[str, Any]]
1280
- # ) -> List[Dict[str, Any]]:
1281
- # '''
1282
- # Convert annotation DataFrame back to list of dicts using Pandas merge.
1283
- # Ensures all images from page_sizes are present without duplicates.
1284
- # '''
1285
- # # 1. Create a DataFrame containing all required image paths from page_sizes
1286
- # if not page_sizes:
1287
- # return []
1288
- # all_image_paths = [item.get("image_path") for item in page_sizes if item.get("image_path")]
1289
- # if not all_image_paths:
1290
- # return []
1291
- # # Use unique paths
1292
- # pages_df = pd.DataFrame({'image': list(set(all_image_paths))})
1293
-
1294
- # # Check if the DataFrame is empty or lacks necessary columns
1295
- # if all_image_annotations_df.empty or 'image' not in all_image_annotations_df.columns:
1296
- # print("Warning: Annotation DataFrame is empty or missing 'image' column.")
1297
- # # Add empty boxes column and return
1298
- # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1299
- # return pages_df.to_dict(orient='records')
1300
-
1301
- # # 2. Define columns to extract and check availability
1302
- # box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label']
1303
- # available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
1304
-
1305
- # if not available_cols:
1306
- # print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
1307
- # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1308
- # return pages_df.to_dict(orient='records')
1309
-
1310
- # # 3. Prepare the annotation data: drop invalid rows and aggregate boxes
1311
- # coord_cols = ['xmin', 'ymin', 'xmax', 'ymax']
1312
- # valid_box_df = all_image_annotations_df.dropna(
1313
- # subset=[col for col in coord_cols if col in available_cols]
1314
- # ).copy() # Use .copy()
1315
-
1316
- # if valid_box_df.empty:
1317
- # print("Warning: No valid annotation rows found after dropping NA coordinates.")
1318
- # pages_df['boxes'] = [[] for _ in range(len(pages_df))]
1319
- # return pages_df.to_dict(orient='records')
1320
-
1321
-
1322
- # # Aggregate boxes into lists of dictionaries per image
1323
- # def aggregate_boxes(group):
1324
- # return group[available_cols].to_dict(orient='records')
1325
-
1326
- # # Group by image and apply the aggregation
1327
- # grouped_boxes = valid_box_df.groupby('image', observed=True, sort=False).apply(aggregate_boxes).reset_index(name='boxes')
1328
-
1329
- # # 4. Perform a left merge: keep all images from pages_df, add boxes where they exist
1330
- # merged_df = pd.merge(pages_df, grouped_boxes, on='image', how='left')
1331
-
1332
- # # 5. Fill NaN in 'boxes' column (for images with no annotations) with empty lists
1333
- # # Ensure the column exists before trying to fillna
1334
- # if 'boxes' in merged_df.columns:
1335
- # # Use apply with a lambda for robust filling of NAs or potential None values
1336
- # merged_df['boxes'] = merged_df['boxes'].apply(lambda x: [] if pd.isna(x) else x)
1337
- # else:
1338
- # # Should not happen with left merge, but handle defensively
1339
- # merged_df['boxes'] = [[] for _ in range(len(merged_df))]
1340
-
1341
-
1342
- # # 6. Convert the final DataFrame to the list of dictionaries format
1343
- # result = merged_df.to_dict(orient='records')
1344
-
1345
- # return result
1346
-
1347
- def convert_annotation_json_to_review_df(all_annotations:List[dict],
1348
- redaction_decision_output:pd.DataFrame=pd.DataFrame(),
1349
- page_sizes:pd.DataFrame=pd.DataFrame(),
1350
- do_proximity_match:bool=True) -> pd.DataFrame:
1351
  '''
1352
- Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (if option selected).
 
 
1353
  '''
1354
-
1355
- review_file_df = convert_annotation_data_to_dataframe(all_annotations)
1356
-
1357
- if page_sizes:
1358
- page_sizes_df = pd.DataFrame(page_sizes)
1359
- page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
1360
-
1361
- review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)
1362
-
1363
- redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)
1364
 
1365
- # Join on additional text data from decision output results if included, if text not already there
1366
- if not redaction_decision_output.empty and not review_file_df.empty and do_proximity_match == True:
 
1367
 
1368
- # Match text to review file to match on text
1369
- review_file_df = do_proximity_match_all_pages_for_text(df1 = review_file_df.copy(), df2 = redaction_decision_output.copy())
1370
-
1371
- # Ensure required columns exist, filling with blank if they don't
1372
- check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
1373
 
1374
- for col in check_columns:
1375
  if col not in review_file_df.columns:
 
 
1376
  review_file_df[col] = ''
1377
 
1378
- if not review_file_df.empty:
1379
- review_file_df = review_file_df[check_columns]
1380
- else:
1381
- review_file_df = pd.DataFrame(columns=check_columns)
1382
 
 
1383
  # If colours are saved as list, convert to tuple
1384
- review_file_df.loc[:,"color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
 
1385
 
1386
- review_file_df = review_file_df.sort_values(['page', 'ymin', 'xmin', 'label'])
 
 
 
 
 
1387
 
1388
  return review_file_df
1389
1390
  def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
1391
  image_paths:List[Image.Image],
1392
  page_sizes:List[dict]=[]) -> List[dict]:
@@ -1404,9 +1590,15 @@ def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
1404
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
1405
 
1406
  review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
1407
 
1408
  # Keep only necessary columns
1409
- review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label"])
1410
 
1411
  # If colours are saved as list, convert to tuple
1412
  review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
 
19
  from pdf2image import convert_from_path
20
  from PIL import Image
21
  from scipy.spatial import cKDTree
22
+ import random
23
+ import string
24
 
25
  IMAGE_NUM_REGEX = re.compile(r'_(\d+)\.png$')
26
 
 
836
  # Filter items with non-empty boxes
837
  non_empty_boxes = [item for item in items if item.get('boxes')]
838
 
839
+ # Remove 'text' elements from boxes (deprecated)
840
+ #for item in non_empty_boxes:
841
+ # if 'boxes' in item:
842
+ # item['boxes'] = [{k: v for k, v in box.items() if k != 'text'} for box in item['boxes']]
843
 
844
  if non_empty_boxes:
845
  # Keep the first entry with non-empty boxes
 
857
  review_file_df_out = review_file_df
858
 
859
  if xmin in review_file_df.columns and not review_file_df.empty:
860
+ coord_cols = [xmin, xmax, ymin, ymax]
861
+ for col in coord_cols:
862
+ review_file_df.loc[:, col] = pd.to_numeric(review_file_df[col], errors="coerce")
863
+
864
  review_file_df_orig = review_file_df.copy().loc[(review_file_df[xmin] <= 1) & (review_file_df[xmax] <= 1) & (review_file_df[ymin] <= 1) & (review_file_df[ymax] <= 1),:]
865
+
866
+ #print("review_file_df_orig:", review_file_df_orig)
867
 
868
+ review_file_df_div = review_file_df.loc[(review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) & (review_file_df[ymin] > 1) & (review_file_df[ymax] > 1),:]
869
 
870
+ #print("review_file_df_div:", review_file_df_div)
871
 
872
+ review_file_df_div.loc[:, "page"] = pd.to_numeric(review_file_df_div["page"], errors="coerce")
873
 
874
  if "image_width" not in review_file_df_div.columns and not page_sizes_df.empty:
875
 
 
910
 
911
 
912
  if xmin in review_file_df.columns and not review_file_df.empty:
913
+
914
+ coord_cols = [xmin, xmax, ymin, ymax]
915
+ for col in coord_cols:
916
+ review_file_df.loc[:, col] = pd.to_numeric(review_file_df[col], errors="coerce")
917
+
918
  # Separate absolute vs relative coordinates
919
  review_file_df_orig = review_file_df.loc[
920
  (review_file_df[xmin] > 1) & (review_file_df[xmax] > 1) &
 
1027
  if not 'text' in df2.columns: df2['text'] = ''
1028
  if not 'text' in df1.columns: df1['text'] = ''
1029
 
1030
+ for col in ['xmin', 'ymin', 'xmax', 'ymax']:
1031
+ df1[col] = pd.to_numeric(df1[col], errors='coerce')
1032
+
1033
+ for col in ['xmin', 'ymin', 'xmax', 'ymax']:
1034
+ df2[col] = pd.to_numeric(df2[col], errors='coerce')
1035
+
1036
  # Create a unique key based on coordinates and label for exact merge
1037
  merge_keys = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page']
1038
  df1['key'] = df1[merge_keys].astype(str).agg('_'.join, axis=1)
 
1050
 
1051
  # Handle missing matches using a proximity-based approach
1052
  # Convert coordinates to numpy arrays for KDTree lookup
1053
+
1054
+
1055
  query_coords = np.array(df1[['xmin', 'ymin', 'xmax', 'ymax']].values, dtype=float)
1056
 
1057
  # Check for NaN or infinite values in query_coords and filter them out
 
1085
 
1086
  return merged_df
1087
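For reference, do_proximity_match_all_pages_for_text matches unjoined boxes to their nearest neighbour by coordinates via scipy's cKDTree, which is why the hunk above coerces the coordinate columns to numeric before building the float arrays. A minimal sketch of that lookup with hypothetical data:

import pandas as pd
from scipy.spatial import cKDTree

# df1: boxes that need text; df2: boxes that carry text
df1 = pd.DataFrame({"xmin": [0.10, 0.52], "ymin": [0.20, 0.61], "xmax": [0.30, 0.70], "ymax": [0.25, 0.66]})
df2 = pd.DataFrame({"xmin": [0.11, 0.50], "ymin": [0.21, 0.60], "xmax": [0.29, 0.71], "ymax": [0.24, 0.67],
                    "text": ["alpha", "beta"]})

tree = cKDTree(df2[["xmin", "ymin", "xmax", "ymax"]].to_numpy(dtype=float))
distances, indices = tree.query(df1[["xmin", "ymin", "xmax", "ymax"]].to_numpy(dtype=float), k=1)

threshold = 0.03  # hypothetical tolerance: accept only near-identical boxes
df1["text"] = [df2["text"].iloc[i] if d <= threshold else "" for d, i in zip(distances, indices)]
print(df1)  # both rows pick up text, since each match lies within the tolerance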
1088
  def _extract_page_number(image_path: Any) -> int:
1089
  """Helper function to safely extract page number."""
1090
  if not isinstance(image_path, str):
 
1103
  '''
1104
  if not all_annotations:
1105
  # Return an empty DataFrame with the expected schema if input is empty
1106
+ return pd.DataFrame(columns=["image", "page", "xmin", "xmax", "ymin", "ymax", "text", "id"])
1107
 
1108
  # 1. Create initial DataFrame from the list of annotations
1109
  # Use list comprehensions with .get() for robustness
 
1120
  # Explode removes rows where the list is empty. We want to keep them
1121
  # as rows with NA values. Replace empty lists with a list containing
1122
  # a single placeholder dictionary.
1123
+ placeholder_box = {"xmin": pd.NA, "xmax": pd.NA, "ymin": pd.NA, "ymax": pd.NA, "text": pd.NA, "id": pd.NA}
1124
  df['boxes'] = df['boxes'].apply(lambda x: x if x else [placeholder_box])
1125
 
1126
  # 4. Explode the 'boxes' column. Each item in the list becomes a new row.
 
1142
  # prevents this from being necessary.
1143
 
1144
  # 7. Ensure essential columns exist and set column order
1145
+ essential_box_cols = ["xmin", "xmax", "ymin", "ymax", "text", "id"]
1146
  for col in essential_box_cols:
1147
  if col not in final_df.columns:
1148
  final_df[col] = pd.NA # Add column with NA if it wasn't present in any box
 
1158
 
1159
  return final_df
1160
 
1161
  def create_annotation_dicts_from_annotation_df(
1162
  all_image_annotations_df: pd.DataFrame,
1163
  page_sizes: List[Dict[str, Any]]
 
1181
 
1182
  # 2. Define columns to extract for boxes and check availability
1183
  # Make sure these columns actually exist in the DataFrame
1184
+ box_cols = ['xmin', 'ymin', 'xmax', 'ymax', 'color', 'label', 'text', 'id']
1185
  available_cols = [col for col in box_cols if col in all_image_annotations_df.columns]
1186
 
1187
+ if 'text' in all_image_annotations_df.columns:
1188
+ all_image_annotations_df.loc[all_image_annotations_df['text'].isnull(), 'text'] = ''
1189
+
1190
  if not available_cols:
1191
  print(f"Warning: None of the expected box columns ({box_cols}) found in DataFrame.")
1192
  return list(image_dict.values()) # Return based on page_sizes only
 
1204
  print("Warning: No valid annotation rows found in DataFrame after dropping NA coordinates.")
1205
  return list(image_dict.values())
1206
 
 
1207
  # Process groups
1208
  try:
1209
  for image_path, group in valid_box_df.groupby('image', observed=True, sort=False):
 
1226
 
1227
  return result
1228
 
1229
+ def convert_annotation_json_to_review_df(all_annotations: List[dict],
1230
+ redaction_decision_output: pd.DataFrame = pd.DataFrame(),
1231
+ page_sizes: List[dict] = [],
1232
+ do_proximity_match: bool = True) -> pd.DataFrame:
 
 
 
1233
  '''
1234
+ Convert the annotation json data to a dataframe format.
1235
+ Add on any text from the initial review_file dataframe by joining based on 'id' if available
1236
+ in both sources, otherwise falling back to joining on pages/co-ordinates (if option selected).
1237
  '''
 
 
 
1238
 
1239
+ # 1. Convert annotations to DataFrame
1240
+ # Ensure convert_annotation_data_to_dataframe populates the 'id' column
1241
+ # if 'id' exists in the dictionaries within all_annotations.
1242
 
1243
+ review_file_df = convert_annotation_data_to_dataframe(all_annotations)
1244
 
1245
+ # Only keep rows in review_df where there are coordinates
1246
+ review_file_df.dropna(subset='xmin', axis=0, inplace=True)
1247
+
1248
+ # Exit early if the initial conversion results in an empty DataFrame
1249
+ if review_file_df.empty:
1250
+ # Define standard columns for an empty return DataFrame
1251
+ check_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text", "id"]
1252
+ # Ensure 'id' is included if it might have been expected
1253
+ return pd.DataFrame(columns=[col for col in check_columns if col != 'id' or 'id' in review_file_df.columns])
1254
+
1255
+ # 2. Handle page sizes if provided
1256
+ if page_sizes:
1257
+ page_sizes_df = pd.DataFrame(page_sizes) # Ensure it's a DataFrame
1258
+ # Safely convert page column to numeric
1259
+ page_sizes_df["page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
1260
+ page_sizes_df.dropna(subset=["page"], inplace=True) # Drop rows where conversion failed
1261
+ page_sizes_df["page"] = page_sizes_df["page"].astype(int) # Convert to int after handling errors/NaNs
1262
+
1263
+
1264
+ # Apply coordinate division if page_sizes_df is not empty after processing
1265
+ if not page_sizes_df.empty:
1266
+ # Ensure 'page' column in review_file_df is numeric for merging
1267
+ if 'page' in review_file_df.columns:
1268
+ review_file_df['page'] = pd.to_numeric(review_file_df['page'], errors='coerce')
1269
+ # Drop rows with invalid pages before division
1270
+ review_file_df.dropna(subset=['page'], inplace=True)
1271
+ review_file_df['page'] = review_file_df['page'].astype(int)
1272
+ review_file_df = divide_coordinates_by_page_sizes(review_file_df, page_sizes_df)
1273
+
1274
+ print("review_file_df after coord divide:", review_file_df)
1275
+
1276
+ # Also apply to redaction_decision_output if it's not empty and has page numbers
1277
+ if not redaction_decision_output.empty and 'page' in redaction_decision_output.columns:
1278
+ redaction_decision_output['page'] = pd.to_numeric(redaction_decision_output['page'], errors='coerce')
1279
+ # Drop rows with invalid pages before division
1280
+ redaction_decision_output.dropna(subset=['page'], inplace=True)
1281
+ redaction_decision_output['page'] = redaction_decision_output['page'].astype(int)
1282
+ redaction_decision_output = divide_coordinates_by_page_sizes(redaction_decision_output, page_sizes_df)
1283
+
1284
+ print("redaction_decision_output after coord divide:", redaction_decision_output)
1285
+ else:
1286
+ print("Warning: Page sizes DataFrame became empty after processing, skipping coordinate division.")
1287
+
1288
+
1289
+ # 3. Join additional data from redaction_decision_output if provided
1290
+ if not redaction_decision_output.empty:
1291
+ # --- NEW LOGIC: Prioritize joining by 'id' ---
1292
+ id_col_exists_in_review = 'id' in review_file_df.columns
1293
+ id_col_exists_in_redaction = 'id' in redaction_decision_output.columns
1294
+ joined_by_id = False # Flag to track if ID join was successful
1295
+
1296
+ if id_col_exists_in_review and id_col_exists_in_redaction:
1297
+ #print("Attempting to join data based on 'id' column.")
1298
+ try:
1299
+ # Ensure 'id' columns are of compatible types (e.g., string) to avoid merge errors
1300
+ review_file_df['id'] = review_file_df['id'].astype(str)
1301
+ # Make a copy to avoid SettingWithCopyWarning if redaction_decision_output is used elsewhere
1302
+ redaction_copy = redaction_decision_output.copy()
1303
+ redaction_copy['id'] = redaction_copy['id'].astype(str)
1304
+
1305
+ # Select columns to merge from redaction output.
1306
+ # Primarily interested in 'text', but keep 'id' for the merge key.
1307
+ # Add other columns from redaction_copy if needed.
1308
+ cols_to_merge = ['id']
1309
+ if 'text' in redaction_copy.columns:
1310
+ cols_to_merge.append('text')
1311
+ else:
1312
+ print("Warning: 'text' column not found in redaction_decision_output. Cannot merge text using 'id'.")
1313
+
1314
+ # Perform a left merge to keep all annotations and add matching text
1315
+ # Suffixes prevent collision if 'text' already exists and we want to compare/choose
1316
+ original_cols = review_file_df.columns.tolist()
1317
+ merged_df = pd.merge(
1318
+ review_file_df,
1319
+ redaction_copy[cols_to_merge],
1320
+ on='id',
1321
+ how='left',
1322
+ suffixes=('', '_redaction') # Suffix applied to columns from right df if names clash
1323
+ )
1324
+
1325
+ # Update the original 'text' column. Prioritize text from redaction output.
1326
+ # If redaction output had 'text', a 'text_redaction' column now exists.
1327
+ if 'text_redaction' in merged_df.columns:
1328
+ if 'text' not in merged_df.columns: # If review_file_df didn't have text initially
1329
+ merged_df['text'] = merged_df['text_redaction']
1330
+ else:
1331
+ # Use text from redaction where available, otherwise keep original text
1332
+ merged_df['text'] = merged_df['text_redaction'].combine_first(merged_df['text'])
1333
+
1334
+ # Remove the temporary column
1335
+ merged_df = merged_df.drop(columns=['text_redaction'])
1336
+
1337
+ # Ensure final columns match original expectation + potentially new 'text'
1338
+ final_cols = original_cols
1339
+ if 'text' not in final_cols and 'text' in merged_df.columns:
1340
+ final_cols.append('text') # Make sure text column is kept if newly added
1341
+ # Reorder/select columns if necessary, ensuring 'id' is kept
1342
+ review_file_df = merged_df[[col for col in final_cols if col in merged_df.columns] + (['id'] if 'id' not in final_cols else [])]
1343
+
1344
+
1345
+ #print("Successfully joined data using 'id'.")
1346
+ joined_by_id = True
1347
+
1348
+ except Exception as e:
1349
+ print(f"Error during 'id'-based merge: {e}. Falling back to proximity match if enabled.")
1350
+ # Fall through to proximity match below if an error occurred
1351
+
1352
+ # --- Fallback to proximity match ---
1353
+ if not joined_by_id and do_proximity_match:
1354
+ if not id_col_exists_in_review or not id_col_exists_in_redaction:
1355
+ print("Could not join by 'id' (column missing in one or both sources).")
1356
+ print("Performing proximity match to add text data.")
1357
+ # Match text to review file using proximity
1358
+
1359
+ review_file_df = do_proximity_match_all_pages_for_text(df1=review_file_df.copy(), df2=redaction_decision_output.copy())
1360
+ elif not joined_by_id and not do_proximity_match:
1361
+ print("Skipping joining text data (ID join not possible, proximity match disabled).")
1362
+ # --- End of join logic ---
1363
+
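The id-based join above is the standard pandas pattern of a left merge with suffixes followed by combine_first, which prefers the incoming value and falls back to the existing one. A standalone sketch with toy data (not the app's real frames):

import pandas as pd

review = pd.DataFrame({"id": ["a", "b"], "text": ["old", None]})
redaction = pd.DataFrame({"id": ["b"], "text": ["new"]})

merged = review.merge(redaction, on="id", how="left", suffixes=("", "_redaction"))
# Prefer text from the redaction output where present, otherwise keep the original
merged["text"] = merged["text_redaction"].combine_first(merged["text"])
merged = merged.drop(columns=["text_redaction"])
print(merged)  # row 'a' keeps "old"; row 'b' picks up "new"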
1364
+ # 4. Ensure required columns exist, filling with blank if they don't
1365
+ # Define base required columns, 'id' might or might not be present initially
1366
+ required_columns = ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]
1367
+ # Add 'id' to required list if it exists in the dataframe at this point
1368
+ if 'id' in review_file_df.columns:
1369
+ required_columns.append('id')
1370
+
1371
+ for col in required_columns:
1372
  if col not in review_file_df.columns:
1373
+ # Decide default value based on column type (e.g., '' for text, np.nan for numeric?)
1374
+ # Using '' for simplicity here.
1375
  review_file_df[col] = ''
1376
 
1377
+ # Select and order the final set of columns
1378
+ review_file_df = review_file_df[required_columns]
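Step 4 above, together with the final column selection, amounts to a reusable ensure-schema helper; a compact variant of the same idea (illustrative, not part of the commit):

import pandas as pd

def ensure_columns(df: pd.DataFrame, required: list, default="") -> pd.DataFrame:
    # Add any missing columns with a default value, then fix the column order
    for col in required:
        if col not in df.columns:
            df[col] = default
    return df[required]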
 
 
1379
 
1380
+ # 5. Final processing and sorting
1381
  # If colours are saved as list, convert to tuple
1382
+ if 'color' in review_file_df.columns:
1383
+ review_file_df["color"] = review_file_df["color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
1384
 
1385
+ # Sort the results
1386
+ sort_columns = ['page', 'ymin', 'xmin', 'label']
1387
+ # Ensure sort columns exist before sorting
1388
+ valid_sort_columns = [col for col in sort_columns if col in review_file_df.columns]
1389
+ if valid_sort_columns:
1390
+ review_file_df = review_file_df.sort_values(valid_sort_columns)
1391
 
1392
  return review_file_df
1393
 
1394
+ def fill_missing_box_ids(data_input: dict) -> dict:
1395
+ """
1396
+ Generates a unique alphanumeric ID for a bounding box dictionary
1397
+ when its 'id' is missing, blank, or not a 12-character string.
1398
+
1399
+ Args:
1400
+ data_input (dict): A single bounding box dictionary, potentially with
1401
+ an 'id' key. (Despite the plural phrasing elsewhere, the
1402
+ current implementation operates on one box dict at a time.)
1403
+
1404
+ Returns:
1405
+ dict: The input dictionary with missing/invalid box IDs filled.
1406
+ Note: The function modifies the input dictionary in place.
1407
+ """
1408
+
1409
+ # --- Input Validation ---
1410
+ if not isinstance(data_input, dict):
1411
+ raise TypeError("Input 'data_input' must be a dictionary.")
1412
+ #if 'boxes' not in data_input or not isinstance(data_input.get('boxes'), list):
1413
+ # raise ValueError("Input dictionary must contain a 'boxes' key with a list value.")
1414
+
1415
+ boxes = data_input # ['boxes'] access disabled: the function currently receives a single box dict
1416
+ id_length = 12
1417
+ character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
1418
+
1419
+ # --- Get Existing IDs to Ensure Uniqueness ---
1420
+ # Collect all valid existing IDs first
1421
+ existing_ids = set()
1422
+ #for box in boxes:
1423
+ # Check if 'id' exists, is a string, and is the correct length
1424
+ box_id = boxes.get('id')
1425
+ if isinstance(box_id, str) and len(box_id) == id_length:
1426
+ existing_ids.add(box_id)
1427
+
1428
+ # --- Identify and Fill Rows Needing IDs ---
1429
+ generated_ids_set = set() # Keep track of IDs generated *in this run*
1430
+ num_filled = 0
1431
+
1432
+ #for box in boxes:
1433
+ box_id = boxes.get('id')
1434
+
1435
+ # Check if ID needs to be generated
1436
+ # Needs ID if: key is missing, value is None, value is not a string,
1437
+ # value is an empty string after stripping whitespace, or value is a string
1438
+ # but not of the correct length.
1439
+ needs_new_id = (
1440
+ box_id is None or
1441
+ not isinstance(box_id, str) or
1442
+ box_id.strip() == "" or
1443
+ len(box_id) != id_length
1444
+ )
1445
+
1446
+ if needs_new_id:
1447
+ # Generate a unique ID
1448
+ attempts = 0
1449
+ while True:
1450
+ candidate_id = ''.join(random.choices(character_set, k=id_length))
1451
+ # Check against *all* existing valid IDs and *newly* generated ones in this run
1452
+ if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
1453
+ generated_ids_set.add(candidate_id)
1454
+ boxes['id'] = candidate_id # Assign the new ID directly to the box dict
1455
+ num_filled += 1
1456
+ break # Found a unique ID
1457
+ attempts += 1
1458
+ # Safety break for unlikely infinite loop (though highly improbable with 12 chars)
1459
+ if attempts > len(boxes) * 100 + 1000:
1460
+ raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check ID length or existing IDs.")
1461
+
1462
+ if num_filled > 0:
1463
+ pass
1464
+ #print(f"Successfully filled {num_filled} missing or invalid box IDs.")
1465
+ else:
1466
+ pass
1467
+ #print("No missing or invalid box IDs found.")
1468
+
1469
+
1470
+ # The input dictionary 'data_input' has been modified in place
1471
+ return data_input
1472
+
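A quick usage sketch for fill_missing_box_ids as committed (it mutates and returns a single box dict; the values are illustrative):

box = {"label": "EMAIL", "xmin": 0.1, "ymin": 0.2, "xmax": 0.3, "ymax": 0.25, "id": ""}
box = fill_missing_box_ids(box)
print(len(box["id"]))  # 12 — a fresh alphanumeric id replaces the blank one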
1473
+ def fill_missing_ids(df: pd.DataFrame, column_name: str = 'id', length: int = 12) -> pd.DataFrame:
1474
+ """
1475
+ Generates unique alphanumeric IDs for rows in a DataFrame column
1476
+ where the value is missing (NaN, None) or an empty string.
1477
+
1478
+ Args:
1479
+ df (pd.DataFrame): The input Pandas DataFrame.
1480
+ column_name (str): The name of the column to check and fill (defaults to 'id').
1481
+ This column will be added if it doesn't exist.
1482
+ length (int): The desired length of the generated IDs (defaults to 12).
1483
+ Must be large enough that the alphanumeric character set can
1484
+ supply the number of unique IDs needed (62**length combinations).
1485
+
1486
+ Returns:
1487
+ pd.DataFrame: The DataFrame with missing/empty IDs filled in the specified column.
1488
+ Note: The function modifies the DataFrame in place.
1489
+ """
1490
+
1491
+ # --- Input Validation ---
1492
+ if not isinstance(df, pd.DataFrame):
1493
+ raise TypeError("Input 'df' must be a Pandas DataFrame.")
1494
+ if not isinstance(column_name, str) or not column_name:
1495
+ raise ValueError("'column_name' must be a non-empty string.")
1496
+ if not isinstance(length, int) or length <= 0:
1497
+ raise ValueError("'length' must be a positive integer.")
1498
+
1499
+ # --- Ensure Column Exists ---
1500
+ if column_name not in df.columns:
1501
+ print(f"Column '{column_name}' not found. Adding it to the DataFrame.")
1502
+ df[column_name] = np.nan # Initialize with NaN
1503
+
1504
+ # --- Identify Rows Needing IDs ---
1505
+ # Check for NaN, None, or empty strings ('')
1506
+ # Convert to string temporarily for robust empty string check, handle potential errors
1507
+ try:
1508
+ df[column_name] = df[column_name].astype(str) # convert everything to string; NaN/None become 'nan'/'None', which then fail the length check below
1509
+ is_missing_or_empty = (
1510
+ df[column_name].isna()
1511
+ #| (df[column_name].astype(str).str.strip() == '')
1512
+ #| (df[column_name] == "nan")
1513
+ | (df[column_name].astype(str).str.len() != length)
1514
+ )
1515
+ except Exception as e:
1516
+ # Fallback if conversion to string fails (e.g., column contains complex objects)
1517
+ print(f"Warning: Could not perform reliable empty string check on column '{column_name}' due to data type issues. Checking for NaN/None only. Error: {e}")
1518
+ is_missing_or_empty = df[column_name].isna()
1519
+
1520
+ rows_to_fill_index = df.index[is_missing_or_empty]
1521
+ num_needed = len(rows_to_fill_index)
1522
+
1523
+ if num_needed == 0:
1524
+ #print(f"No missing or empty values found in column '{column_name}'.")
1525
+ return df
1526
+
1527
+ print(f"Found {num_needed} rows requiring a unique ID in column '{column_name}'.")
1528
+
1529
+ # --- Get Existing IDs to Ensure Uniqueness ---
1530
+ try:
1531
+ # Get all non-missing, non-empty string values from the column
1532
+ existing_ids = set(df.loc[~is_missing_or_empty, column_name].astype(str))
1533
+ except Exception as e:
1534
+ print(f"Warning: Could not reliably get all existing string IDs from column '{column_name}' due to data type issues. Uniqueness check might be less strict. Error: {e}")
1535
+ # Fallback: Get only non-NaN IDs, potential type issues ignored
1536
+ existing_ids = set(df.loc[df[column_name].notna(), column_name])
1537
+
1538
+
1539
+ # --- Generate Unique IDs ---
1540
+ character_set = string.ascii_letters + string.digits # a-z, A-Z, 0-9
1541
+ generated_ids_set = set() # Keep track of IDs generated *in this run*
1542
+ new_ids_list = [] # Store the generated IDs in order
1543
+
1544
+ max_possible_ids = len(character_set) ** length
1545
+ if num_needed > max_possible_ids:
1546
+ raise ValueError(f"Cannot generate {num_needed} unique IDs with length {length}. Maximum possible is {max_possible_ids}.")
1547
+ # Add a check for practical limits if needed, e.g., if num_needed is very close to max_possible_ids, generation could be slow.
1548
+
1549
+ #print(f"Generating {num_needed} unique IDs of length {length}...")
1550
+ for i in range(num_needed):
1551
+ attempts = 0
1552
+ while True:
1553
+ candidate_id = ''.join(random.choices(character_set, k=length))
1554
+ # Check against *all* existing IDs and *newly* generated ones
1555
+ if candidate_id not in existing_ids and candidate_id not in generated_ids_set:
1556
+ generated_ids_set.add(candidate_id)
1557
+ new_ids_list.append(candidate_id)
1558
+ break # Found a unique ID
1559
+ attempts += 1
1560
+ if attempts > num_needed * 100 and attempts > 1000: # Safety break for unlikely infinite loop
1561
+ raise RuntimeError(f"Failed to generate a unique ID after {attempts} attempts. Check length and character set or existing IDs.")
1562
+
1563
+ # Optional progress update for large numbers
1564
+ if (i + 1) % 1000 == 0:
1565
+ print(f"Generated {i+1}/{num_needed} IDs...")
1566
+
1567
+
1568
+ # --- Assign New IDs ---
1569
+ # Use the previously identified index to assign the new IDs correctly
1570
+ df.loc[rows_to_fill_index, column_name] = new_ids_list
1571
+ #print(f"Successfully filled {len(new_ids_list)} missing values in column '{column_name}'.")
1572
+
1573
+ # The DataFrame 'df' has been modified in place
1574
+ return df
1575
+
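And the DataFrame counterpart. Note that after astype(str) a NaN becomes the literal string 'nan' (length 3), so it fails the 12-character length check and is treated as missing; with 62 characters and length 12 there are 62**12 ≈ 3.2×10^21 possible ids, so collisions are effectively impossible. Illustrative usage:

import pandas as pd

df = pd.DataFrame({"page": [1, 1, 2], "id": ["abcdefghijkl", None, ""]})
df = fill_missing_ids(df)  # the second and third rows receive fresh 12-character ids
assert df["id"].str.len().eq(12).all()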
1576
  def convert_review_df_to_annotation_json(review_file_df:pd.DataFrame,
1577
  image_paths:List[Image.Image],
1578
  page_sizes:List[dict]=[]) -> List[dict]:
 
1590
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
1591
 
1592
  review_file_df = multiply_coordinates_by_page_sizes(review_file_df, page_sizes_df)
1593
+
1594
+ review_file_df = fill_missing_ids(review_file_df)
1595
+
1596
+ if 'id' not in review_file_df.columns:
1597
+ review_file_df['id'] = ''
1598
+ review_file_df['id'] = review_file_df['id'].astype(str)
1599
 
1600
  # Keep only necessary columns
1601
+ review_file_df = review_file_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "id", "text"]].drop_duplicates(subset=["image", "page", "xmin", "ymin", "xmax", "ymax", "label", "id"])
1602
 
1603
  # If colours are saved as list, convert to tuple
1604
  review_file_df.loc[:, "color"] = review_file_df.loc[:,"color"].apply(lambda x: tuple(x) if isinstance(x, list) else x)
tools/file_redaction.py CHANGED
@@ -21,7 +21,7 @@ from collections import defaultdict # For efficient grouping
21
 
22
  from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
23
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
24
- from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
@@ -166,10 +166,10 @@ def choose_and_run_redactor(file_paths:List[str],
166
 
167
  # Ensure all_pages_decision_process_table is in correct format for downstream processes
168
  if isinstance(all_pages_decision_process_table,list):
169
- if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"])
170
  elif isinstance(all_pages_decision_process_table, pd.DataFrame):
171
  if all_pages_decision_process_table.empty:
172
- all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"])
173
 
174
  # If this is the first time around, set variables to 0/blank
175
  if first_loop_state==True:
@@ -211,6 +211,7 @@ def choose_and_run_redactor(file_paths:List[str],
211
  if latest_file_completed >= number_of_files:
212
 
213
  print("Completed last file")
 
214
  current_loop_page = 0
215
 
216
  if isinstance(out_message, list) and out_message:
@@ -383,7 +384,7 @@ def choose_and_run_redactor(file_paths:List[str],
383
 
384
  progress(0.5, desc="Extracting text and redacting document")
385
 
386
- all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text"])
387
  all_line_level_ocr_results_df = pd.DataFrame()
388
 
389
  # Run through file loop, redact each file at a time
@@ -502,6 +503,8 @@ def choose_and_run_redactor(file_paths:List[str],
502
  if latest_file_completed != len(file_paths_list):
503
  print("Completed file number:", str(latest_file_completed), "there are more files to do")
504
 
 
 
505
  # Save redacted file
506
  if pii_identification_method != no_redaction_option:
507
  if is_pdf(file_path) == False:
@@ -512,7 +515,7 @@ def choose_and_run_redactor(file_paths:List[str],
512
  #
513
  else:
514
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
515
- print("saving redacted pdf file:", out_redacted_pdf_file_path)
516
  pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
517
 
518
  out_file_paths.append(out_redacted_pdf_file_path)
@@ -522,7 +525,6 @@ def choose_and_run_redactor(file_paths:List[str],
522
  else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
523
 
524
  ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
525
-
526
  all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
527
 
528
  all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
@@ -539,6 +541,8 @@ def choose_and_run_redactor(file_paths:List[str],
539
 
540
  annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
541
 
 
 
542
  # Save the gradio_annotation_boxes to a review csv file
543
  review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
544
 
@@ -838,7 +842,10 @@ def prepare_custom_image_recogniser_result_annotation_box(page:Page, annot:dict,
838
  if hasattr(annot, 'text') and annot.text:
839
  img_annotation_box["text"] = str(annot.text)
840
  else:
841
- img_annotation_box["text"] = ""
 
 
 
842
 
843
  return img_annotation_box, rect
844
 
@@ -953,6 +960,10 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
953
  page_annotations = page_annotations["boxes"]
954
 
955
  for annot in page_annotations:
 
 
 
 
956
  # Check if an Image recogniser result, or a Gradio annotation object
957
  if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
958
 
@@ -960,6 +971,7 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
960
 
961
  # Should already be in correct format if img_annotator_box is an input
962
  if isinstance(annot, dict):
 
963
  img_annotation_box = annot
964
 
965
  box_coordinates = (img_annotation_box['xmin'], img_annotation_box['ymin'], img_annotation_box['xmax'], img_annotation_box['ymax'])
@@ -1004,6 +1016,8 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None,
1004
 
1005
  img_annotation_box, rect = convert_pikepdf_annotations_to_result_annotation_box(page, annot, image, convert_pikepdf_to_pymupdf_coords, page_sizes_df, image_dimensions=image_dimensions)
1006
 
 
 
1007
  #print("image_dimensions:", image_dimensions)
1008
  #print("annot:", annot)
1009
 
@@ -1155,7 +1169,7 @@ def redact_image_pdf(file_path:str,
1155
  page_break_return:bool=False,
1156
  annotations_all_pages:List=[],
1157
  all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
1158
- all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score"]),
1159
  pymupdf_doc:Document = [],
1160
  pii_identification_method:str="Local",
1161
  comprehend_query_number:int=0,
@@ -1490,11 +1504,15 @@ def redact_image_pdf(file_path:str,
1490
  'start': result.start,
1491
  'end': result.end,
1492
  'score': result.score,
1493
- 'page': reported_page_number
1494
  } for result in page_merged_redaction_bboxes])
1495
 
1496
  all_pages_decision_process_table_list.append(decision_process_table)
1497
 
 
 
 
 
1498
  # Convert to DataFrame and add to ongoing logging table
1499
  line_level_ocr_results_df = pd.DataFrame([{
1500
  'page': reported_page_number,
@@ -1739,12 +1757,16 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
1739
  analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
1740
 
1741
  # Convert the new columns to integers (if needed)
1742
- analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
1743
 
1744
  analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
1745
  analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
1746
  analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
1747
  analysed_bounding_boxes_df_new['page'] = page_num + 1
 
 
 
 
1748
  decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
1749
 
1750
  return decision_process_table
@@ -1786,7 +1808,7 @@ def redact_text_pdf(
1786
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1787
  annotations_all_pages: List[dict] = [], # List of annotations across all pages
1788
  all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
1789
- all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text"]), # DataFrame for decision process table
1790
  pymupdf_doc: List = [], # List of PyMuPDF documents
1791
  pii_identification_method: str = "Local",
1792
  comprehend_query_number:int = 0,
@@ -1967,7 +1989,7 @@ def redact_text_pdf(
1967
  pymupdf_page, page_image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_redaction_annotations_on_page, image_path, redact_whole_page=redact_whole_page, convert_pikepdf_to_pymupdf_coords=True, original_cropbox=original_cropboxes[page_no], page_sizes_df=page_sizes_df)
1968
 
1969
  # Create decision process table
1970
- page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
1971
 
1972
  if not page_decision_process_table.empty:
1973
  all_pages_decision_process_table_list.append(page_decision_process_table)
@@ -2035,7 +2057,7 @@ def redact_text_pdf(
2035
 
2036
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2037
 
2038
- # Write decision logs
2039
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2040
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2041
 
 
21
 
22
  from tools.config import OUTPUT_FOLDER, IMAGES_DPI, MAX_IMAGE_PIXELS, RUN_AWS_FUNCTIONS, AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_REGION, PAGE_BREAK_VALUE, MAX_TIME_VALUE, LOAD_TRUNCATED_IMAGES, INPUT_FOLDER
23
  from tools.custom_image_analyser_engine import CustomImageAnalyzerEngine, OCRResult, combine_ocr_results, CustomImageRecognizerResult, run_page_text_redaction, merge_text_bounding_boxes
24
+ from tools.file_conversion import convert_annotation_json_to_review_df, redact_whole_pymupdf_page, redact_single_box, convert_pymupdf_to_image_coords, is_pdf, is_pdf_or_image, prepare_image_or_pdf, divide_coordinates_by_page_sizes, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, divide_coordinates_by_page_sizes, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, fill_missing_box_ids
25
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold, custom_entities, custom_recogniser, custom_word_list_recogniser, CustomWordFuzzyRecognizer
26
  from tools.helper_functions import get_file_name_without_type, clean_unicode_text, tesseract_ocr_option, text_ocr_option, textract_option, local_pii_detector, aws_pii_detector, no_redaction_option
27
  from tools.aws_textract import analyse_page_with_textract, json_to_ocrresult, load_and_convert_textract_json
 
166
 
167
  # Ensure all_pages_decision_process_table is in correct format for downstream processes
168
  if isinstance(all_pages_decision_process_table,list):
169
+ if not all_pages_decision_process_table: all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
170
  elif isinstance(all_pages_decision_process_table, pd.DataFrame):
171
  if all_pages_decision_process_table.empty:
172
+ all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
173
 
174
  # If this is the first time around, set variables to 0/blank
175
  if first_loop_state==True:
 
211
  if latest_file_completed >= number_of_files:
212
 
213
  print("Completed last file")
214
+ progress(0.95, "Completed last file, performing final checks")
215
  current_loop_page = 0
216
 
217
  if isinstance(out_message, list) and out_message:
 
384
 
385
  progress(0.5, desc="Extracting text and redacting document")
386
 
387
+ all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
388
  all_line_level_ocr_results_df = pd.DataFrame()
389
 
390
  # Run through file loop, redact each file at a time
 
503
  if latest_file_completed != len(file_paths_list):
504
  print("Completed file number:", str(latest_file_completed), "there are more files to do")
505
 
506
+ progress(0.9, "Saving redacted PDF file")
507
+
508
  # Save redacted file
509
  if pii_identification_method != no_redaction_option:
510
  if is_pdf(file_path) == False:
 
515
  #
516
  else:
517
  out_redacted_pdf_file_path = output_folder + pdf_file_name_without_ext + "_redacted.pdf"
518
+ print("Saving redacted PDF file:", out_redacted_pdf_file_path)
519
  pymupdf_doc.save(out_redacted_pdf_file_path, garbage=4, deflate=True, clean=True)
520
 
521
  out_file_paths.append(out_redacted_pdf_file_path)
 
525
  else: all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
526
 
527
  ocr_file_path = orig_pdf_file_path + "_ocr_output.csv"
 
528
  all_line_level_ocr_results_df.sort_values(["page", "top", "left"], inplace=True)
529
 
530
  all_line_level_ocr_results_df.to_csv(ocr_file_path, index = None, encoding="utf-8")
 
541
 
542
  annotations_all_pages = remove_duplicate_images_with_blank_boxes(annotations_all_pages)
543
 
544
+
545
+
546
  # Save the gradio_annotation_boxes to a review csv file
547
  review_file_state = convert_annotation_json_to_review_df(annotations_all_pages, all_pages_decision_process_table, page_sizes=page_sizes)
548
 
 
842
  if hasattr(annot, 'text') and annot.text:
843
  img_annotation_box["text"] = str(annot.text)
844
  else:
845
+ img_annotation_box["text"] = ""
846
+
847
+ # Assign an id
848
+ img_annotation_box = fill_missing_box_ids(img_annotation_box)
849
 
850
  return img_annotation_box, rect
851
 
 
960
  page_annotations = page_annotations["boxes"]
961
 
962
  for annot in page_annotations:
963
+
964
+
965
+
966
+
967
  # Check if an Image recogniser result, or a Gradio annotation object
968
  if (isinstance(annot, CustomImageRecognizerResult)) | isinstance(annot, dict):
969
 
 
971
 
972
  # Should already be in correct format if img_annotator_box is an input
973
  if isinstance(annot, dict):
974
+ annot = fill_missing_box_ids(annot)
975
  img_annotation_box = annot
976
 
977
  box_coordinates = (img_annotation_box['xmin'], img_annotation_box['ymin'], img_annotation_box['xmax'], img_annotation_box['ymax'])
 
1016
 
1017
  img_annotation_box, rect = convert_pikepdf_annotations_to_result_annotation_box(page, annot, image, convert_pikepdf_to_pymupdf_coords, page_sizes_df, image_dimensions=image_dimensions)
1018
 
1019
+ img_annotation_box = fill_missing_box_ids(img_annotation_box)
1020
+
1021
  #print("image_dimensions:", image_dimensions)
1022
  #print("annot:", annot)
1023
 
 
1169
  page_break_return:bool=False,
1170
  annotations_all_pages:List=[],
1171
  all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
1172
+ all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
1173
  pymupdf_doc:Document = [],
1174
  pii_identification_method:str="Local",
1175
  comprehend_query_number:int=0,
 
1504
  'start': result.start,
1505
  'end': result.end,
1506
  'score': result.score,
1507
+ 'page': reported_page_number
1508
  } for result in page_merged_redaction_bboxes])
1509
 
1510
  all_pages_decision_process_table_list.append(decision_process_table)
1511
 
1512
+ decision_process_table = fill_missing_ids(decision_process_table)
1513
+ #decision_process_table.to_csv("output/decision_process_table_with_ids.csv")
1514
+
1515
+
1516
  # Convert to DataFrame and add to ongoing logging table
1517
  line_level_ocr_results_df = pd.DataFrame([{
1518
  'page': reported_page_number,
 
1757
  analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
1758
 
1759
  # Convert the new columns to integers (if needed)
1760
+ #analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
1761
 
1762
  analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
1763
  analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
1764
  analysed_bounding_boxes_df_new = pd.concat([analysed_bounding_boxes_df_new, analysed_bounding_boxes_df_text], axis = 1)
1765
  analysed_bounding_boxes_df_new['page'] = page_num + 1
1766
+
1767
+ #analysed_bounding_boxes_df_new = fill_missing_ids(analysed_bounding_boxes_df_new)
1768
+ #analysed_bounding_boxes_df_new.to_csv("output/analysed_bounding_boxes_df_new_with_ids.csv")  # debug output, disabled
1769
+
1770
  decision_process_table = pd.concat([decision_process_table, analysed_bounding_boxes_df_new], axis = 0).drop('result', axis=1)
1771
 
1772
  return decision_process_table
 
1808
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1809
  annotations_all_pages: List[dict] = [], # List of annotations across all pages
1810
  all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
1811
+ all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
1812
  pymupdf_doc: List = [], # List of PyMuPDF documents
1813
  pii_identification_method: str = "Local",
1814
  comprehend_query_number:int = 0,
 
1989
  pymupdf_page, page_image_annotations = redact_page_with_pymupdf(pymupdf_page, pikepdf_redaction_annotations_on_page, image_path, redact_whole_page=redact_whole_page, convert_pikepdf_to_pymupdf_coords=True, original_cropbox=original_cropboxes[page_no], page_sizes_df=page_sizes_df)
1990
 
1991
  # Create decision process table
1992
+ page_decision_process_table = create_text_redaction_process_results(page_analyser_results, page_redaction_bounding_boxes, current_loop_page)
1993
 
1994
  if not page_decision_process_table.empty:
1995
  all_pages_decision_process_table_list.append(page_decision_process_table)
 
2057
 
2058
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
2059
 
2060
+ # Write all page outputs
2061
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2062
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2063
 
tools/redaction_review.py CHANGED
@@ -15,7 +15,7 @@ import pymupdf
15
  from PIL import ImageDraw, Image
16
 
17
  from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
18
- from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21
 
@@ -99,6 +99,8 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
99
  review_dataframe = review_df
100
 
101
  try:
 
 
102
  review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
103
 
104
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
@@ -114,13 +116,13 @@ def get_filtered_recogniser_dataframe_and_dropdowns(page_image_annotator_object:
114
  page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
115
  page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
116
 
117
- recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
118
 
119
- recogniser_dataframe_out = review_dataframe[["page", "label", "text"]]
120
 
121
  except Exception as e:
122
  print("Could not extract recogniser information:", e)
123
- recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text"]]
124
 
125
  label_choices = review_dataframe["label"].astype(str).unique().tolist()
126
  text_choices = review_dataframe["text"].astype(str).unique().tolist()
@@ -151,7 +153,7 @@ def update_recogniser_dataframes(page_image_annotator_object:AnnotatedImageData,
151
 
152
  review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
153
 
154
- recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text"]], show_search="filter", col_count=(3, "fixed"), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, max_height=400)
155
 
156
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
157
  recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
@@ -179,15 +181,32 @@ def update_annotator_page_from_review_df(review_df: pd.DataFrame,
179
  '''
180
  out_image_annotations_state = current_image_annotations_state
181
  out_current_page_annotator = current_page_annotator
 
182
 
183
  if not review_df.empty:
184
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
186
 
187
- print("out_image_annotations_state[current_page-1]:", out_image_annotations_state[current_page-1])
188
 
189
- if previous_page == current_page:
190
- out_current_page_annotator = out_image_annotations_state[current_page-1]
 
191
 
192
  return out_current_page_annotator, out_image_annotations_state
193
 
@@ -206,24 +225,30 @@ def exclude_selected_items_from_redaction(review_df: pd.DataFrame,
206
  backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
207
 
208
  if not selected_rows_df.empty and not review_df.empty:
209
- # Ensure selected_rows_df has the same relevant columns
210
- selected_subset = selected_rows_df[['label', 'page', 'text']].drop_duplicates(subset=['label', 'page', 'text'])
 
 
 
 
211
 
212
- # Perform anti-join using merge with an indicator column
213
- merged_df = review_df.merge(selected_subset, on=['label', 'page', 'text'], how='left', indicator=True)
214
-
215
- # Keep only the rows that do not have a match in selected_rows_df
 
 
 
216
  out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
217
 
218
  out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
219
 
220
- out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text"]]
221
 
222
  # Either there is nothing left in the selection dataframe, or the review dataframe
223
  else:
224
  out_review_df = review_df
225
  out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
226
-
227
  out_image_annotations_state = image_annotations_state
228
 
229
  return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
@@ -234,7 +259,7 @@ def update_annotator_object_and_filter_df(
234
  recogniser_entities_dropdown_value:str="ALL",
235
  page_dropdown_value:str="ALL",
236
  text_dropdown_value:str="ALL",
237
- recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[]}), type="pandas", headers=["page", "label", "text"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400),
238
  zoom:int=100,
239
  review_df:pd.DataFrame=[],
240
  page_sizes:List[dict]=[],
@@ -244,6 +269,8 @@ def update_annotator_object_and_filter_df(
244
  Update a gradio_image_annotation object with new annotation data.
245
  '''
246
  zoom_str = str(zoom) + '%'
 
 
247
 
248
  if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
249
 
@@ -295,10 +322,7 @@ def update_annotator_object_and_filter_df(
295
 
296
  replaced_image_path = current_image_path
297
 
298
- if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"])
299
-
300
- ##
301
-
302
  review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
303
 
304
  # Update dropdowns and review selection dataframe with the updated annotator object
@@ -313,19 +337,27 @@ def update_annotator_object_and_filter_df(
313
  images_list[page_num_reported_zero_indexed] = replaced_image_path
314
 
315
  all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
316
-
317
  # Multiply out image_annotation coordinates from relative to absolute if necessary
318
  all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
319
 
320
  all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
321
 
 
 
322
  all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
323
 
 
 
 
 
324
  # Remove blank duplicate entries
325
  all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
326
 
327
  current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
328
 
 
 
329
  page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
330
 
331
  ###
@@ -537,7 +569,7 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
537
  page_sizes_df = pd.DataFrame(page_sizes)
538
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
539
 
540
- for i in progress.tqdm(range(0, number_of_pages), desc="Saving redactions to file", unit = "pages"):
541
 
542
  image_loc = all_image_annotations[i]['image']
543
 
@@ -561,7 +593,9 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
561
  pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
562
  else:
563
  print("File type not recognised.")
564
-
 
 
565
  #try:
566
  if pdf_doc:
567
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
@@ -579,7 +613,14 @@ def apply_redactions_to_review_df_and_files(page_image_annotator_object:Annotate
579
 
580
  try:
581
  #print("Saving review file.")
582
- review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text"]]#.drop_duplicates(subset=["image", "page", "text", "label","color", "xmin", "ymin", "xmax", "ymax"])
 
 
 
 
 
 
 
583
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
584
 
585
  review_df.to_csv(out_review_file_file_path, index=None)
@@ -752,8 +793,9 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
752
  row_value_page = evt.row_value[0] # This is the page number value
753
 row_value_label = evt.row_value[1] # This is the label value
754
 row_value_text = evt.row_value[2] # This is the text value
 
755
 
756
- row_value_df = pd.DataFrame(data={"page":[row_value_page], "label":[row_value_label], "text":[row_value_text]})
757
 
758
  return row_value_page, row_value_df
759
 
@@ -787,25 +829,61 @@ def df_select_callback_ocr(df: pd.DataFrame, evt: gr.SelectData):
787
 
788
  return row_value_page, row_value_df
789
 
790
- def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, colour:tuple=(0,0,255)):
791
  '''
792
  Update the colour of a single redaction box based on the values in a selection row
793
  '''
794
  colour_tuple = str(tuple(colour))
795
 
796
- if "color" not in review_df.columns: review_df["color"] = None
 
 
797
 
798
  # Reset existing highlight colours
799
- review_df.loc[review_df["color"]==colour_tuple, "color"] = review_df.loc[review_df["color"]==colour_tuple, "color"].apply(lambda _: '(0, 0, 0)')
 
 
 
 
 
 
 
 
 
 
 
 
 
800
 
801
- review_df = review_df.merge(redaction_row_selection, on=["page", "label", "text"], indicator=True, how="left")
802
- review_df.loc[review_df["_merge"]=="both", "color"] = review_df.loc[review_df["_merge"] == "both", "color"].apply(lambda _: '(0, 0, 255)')
 
 
 
 
 
 
 
 
 
 
 
 
 
803
 
804
  review_df.drop("_merge", axis=1, inplace=True)
805
 
806
- review_df.to_csv(OUTPUT_FOLDER + "review_df_in_update_selected_review.csv")
 
 
 
 
 
 
 
 
807
 
808
- return review_df
809
 
810
  def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
811
  """
 
15
  from PIL import ImageDraw, Image
16
 
17
  from tools.config import OUTPUT_FOLDER, CUSTOM_BOX_COLOUR, MAX_IMAGE_PIXELS, INPUT_FOLDER
18
+ from tools.file_conversion import is_pdf, convert_annotation_json_to_review_df, convert_review_df_to_annotation_json, process_single_page_for_image_conversion, multiply_coordinates_by_page_sizes, convert_annotation_data_to_dataframe, create_annotation_dicts_from_annotation_df, remove_duplicate_images_with_blank_boxes, fill_missing_ids, divide_coordinates_by_page_sizes
19
  from tools.helper_functions import get_file_name_without_type, detect_file_type
20
  from tools.file_redaction import redact_page_with_pymupdf
21
 
 
99
  review_dataframe = review_df
100
 
101
  try:
102
+ #print("converting annotation json in get_filtered_recogniser...")
103
+
104
  review_dataframe = convert_annotation_json_to_review_df(page_image_annotator_object, review_df, page_sizes)
105
 
106
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "label")
 
116
  page_entities_for_drop = update_dropdown_list_based_on_dataframe(review_dataframe, "page")
117
  page_entities_drop = gr.Dropdown(value=page_dropdown_value, choices=page_entities_for_drop, allow_custom_value=True, interactive=True)
118
 
119
+ recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text", "id"]], show_search="filter", col_count=(4, "fixed"), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
120
 
121
+ recogniser_dataframe_out = review_dataframe[["page", "label", "text", "id"]]
122
 
123
  except Exception as e:
124
  print("Could not extract recogniser information:", e)
125
+ recogniser_dataframe_out = recogniser_dataframe_base[["page", "label", "text", "id"]]
126
 
127
  label_choices = review_dataframe["label"].astype(str).unique().tolist()
128
  text_choices = review_dataframe["text"].astype(str).unique().tolist()
 
153
 
154
  review_dataframe, text_entities_drop, page_entities_drop = update_entities_df_recogniser_entities(recogniser_entities_dropdown_value, recogniser_dataframe_out, page_dropdown_value, text_dropdown_value)
155
 
156
+ recogniser_dataframe_out_gr = gr.Dataframe(review_dataframe[["page", "label", "text", "id"]], show_search="filter", col_count=(4, "fixed"), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, max_height=400, static_columns=[0,1,2,3])
157
 
158
  recogniser_entities_for_drop = update_dropdown_list_based_on_dataframe(recogniser_dataframe_out, "label")
159
  recogniser_entities_drop = gr.Dropdown(value=recogniser_entities_dropdown_value, choices=recogniser_entities_for_drop, allow_custom_value=True, interactive=True)
 
181
  '''
182
  out_image_annotations_state = current_image_annotations_state
183
  out_current_page_annotator = current_page_annotator
184
+ gradio_annotator_current_page_number = current_page
185
 
186
  if not review_df.empty:
187
+ #print("review_df just before convert_review_df:", review_df)
188
+ # First, check that the image on the current page is valid, replace with what exists in page_sizes object if not
189
+ if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
190
+
191
+ # Check bounding values for current page and page max
192
+ if gradio_annotator_current_page_number > 0: page_num_reported = gradio_annotator_current_page_number
193
+ elif gradio_annotator_current_page_number == 0: page_num_reported = 1 # minimum possible reported page is 1
194
+ else:
195
+ gradio_annotator_current_page_number = 0
196
+ page_num_reported = 1
197
+
198
+ # Ensure page displayed can't exceed number of pages in document
199
+ page_max_reported = len(out_image_annotations_state)
200
+ if page_num_reported > page_max_reported: page_num_reported = page_max_reported
201
+
202
+ page_num_reported_zero_indexed = page_num_reported - 1
203
  out_image_annotations_state = convert_review_df_to_annotation_json(review_df, image_file_paths, page_sizes)
204
 
205
+ page_image_annotator_object, out_image_annotations_state = replace_images_in_image_annotation_object(out_image_annotations_state, out_image_annotations_state[page_num_reported_zero_indexed], page_sizes, page_num_reported)
206
 
207
+ out_image_annotations_state[page_num_reported_zero_indexed] = page_image_annotator_object
208
+
209
+ out_current_page_annotator = out_image_annotations_state[page_num_reported_zero_indexed]
210
 
211
  return out_current_page_annotator, out_image_annotations_state
212
 
 
225
  backup_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
226
 
227
  if not selected_rows_df.empty and not review_df.empty:
228
+ use_id = (
229
+ "id" in selected_rows_df.columns
230
+ and "id" in review_df.columns
231
+ and not selected_rows_df["id"].isnull().all()
232
+ and not review_df["id"].isnull().all()
233
+ )
234
 
235
+ selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
236
+
237
+ # Subset and drop duplicates from selected_rows_df
238
+ selected_subset = selected_rows_df[selected_merge_cols].drop_duplicates(subset=selected_merge_cols)
239
+
240
+ # Perform anti-join using merge with indicator
241
+ merged_df = review_df.merge(selected_subset, on=selected_merge_cols, how='left', indicator=True)
242
  out_review_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
243
 
244
  out_image_annotations_state = convert_review_df_to_annotation_json(out_review_df, image_file_paths, page_sizes)
245
 
246
+ out_recogniser_entity_dataframe_base = out_review_df[["page", "label", "text", "id"]]
247
 
248
  # Either there is nothing left in the selection dataframe, or the review dataframe
249
  else:
250
  out_review_df = review_df
251
  out_recogniser_entity_dataframe_base = recogniser_entity_dataframe_base
 
252
  out_image_annotations_state = image_annotations_state
253
 
254
  return out_review_df, out_image_annotations_state, out_recogniser_entity_dataframe_base, backup_review_state, backup_image_annotations_state, backup_recogniser_entity_dataframe_base
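The exclusion above is an anti-join: left-merge with indicator=True, then keep only the rows that found no match. A minimal standalone sketch (hypothetical frames):

import pandas as pd

review = pd.DataFrame({"id": ["a", "b", "c"], "text": ["x", "y", "z"]})
selected = pd.DataFrame({"id": ["b"]})

merged = review.merge(selected, on=["id"], how="left", indicator=True)
kept = merged[merged["_merge"] == "left_only"].drop(columns=["_merge"])
print(kept["id"].tolist())  # ['a', 'c'] — the selected row 'b' is removed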
 
259
  recogniser_entities_dropdown_value:str="ALL",
260
  page_dropdown_value:str="ALL",
261
  text_dropdown_value:str="ALL",
262
+ recogniser_dataframe_base:gr.Dataframe=gr.Dataframe(pd.DataFrame(data={"page":[], "label":[], "text":[], "id":[]}), type="pandas", headers=["page", "label", "text", "id"], show_fullscreen_button=True, wrap=True, show_search='filter', max_height=400, static_columns=[0,1,2,3]),
263
  zoom:int=100,
264
  review_df:pd.DataFrame=[],
265
  page_sizes:List[dict]=[],
 
269
  Update a gradio_image_annotation object with new annotation data.
270
  '''
271
  zoom_str = str(zoom) + '%'
272
+
273
+ #print("all_image_annotations at start of update_annotator_object_and_filter_df[-1]:", all_image_annotations[-1])
274
 
275
  if not gradio_annotator_current_page_number: gradio_annotator_current_page_number = 0
276
 
 
322
 
323
  replaced_image_path = current_image_path
324
 
325
+ if review_df.empty: review_df = pd.DataFrame(columns=["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text", "id"])
 
 
 
326
  review_df.loc[review_df["page"]==page_num_reported, 'image'] = replaced_image_path
327
 
328
  # Update dropdowns and review selection dataframe with the updated annotator object
 
337
  images_list[page_num_reported_zero_indexed] = replaced_image_path
338
 
339
  all_image_annotations[page_num_reported_zero_indexed]['image'] = replaced_image_path
340
+
341
  # Multiply out image_annotation coordinates from relative to absolute if necessary
342
  all_image_annotations_df = convert_annotation_data_to_dataframe(all_image_annotations)
343
 
344
  all_image_annotations_df = multiply_coordinates_by_page_sizes(all_image_annotations_df, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
345
 
346
+ #print("all_image_annotations_df[-1] just before creating annotation dicts:", all_image_annotations_df.iloc[-1, :])
347
+
348
  all_image_annotations = create_annotation_dicts_from_annotation_df(all_image_annotations_df, page_sizes)
349
 
350
+ #print("all_image_annotations[-1] after creating annotation dicts:", all_image_annotations[-1])
351
+
352
+
353
+
354
  # Remove blank duplicate entries
355
  all_image_annotations = remove_duplicate_images_with_blank_boxes(all_image_annotations)
356
 
357
  current_page_image_annotator_object = all_image_annotations[page_num_reported_zero_indexed]
358
 
359
+ #print("current_page_image_annotator_object that goes into annotator object:", current_page_image_annotator_object)
360
+
361
  page_number_reported_gradio = gr.Number(label = "Current page", value=page_num_reported, precision=0)
362
 
363
  ###
 
569
  page_sizes_df = pd.DataFrame(page_sizes)
570
  page_sizes_df[["page"]] = page_sizes_df[["page"]].apply(pd.to_numeric, errors="coerce")
571
 
572
+ for i in progress.tqdm(range(0, number_of_pages), desc="Saving redacted pages to file", unit = "pages"):
573
 
574
  image_loc = all_image_annotations[i]['image']
575
 
 
593
  pymupdf_page = redact_page_with_pymupdf(page=pymupdf_page, page_annotations=all_image_annotations[i], image=image, original_cropbox=original_cropboxes[-1], page_sizes_df= page_sizes_df) # image=image,
594
  else:
595
  print("File type not recognised.")
596
+
597
+ progress(0.9, "Saving output files")
598
+
599
  #try:
600
  if pdf_doc:
601
  out_pdf_file_path = output_folder + file_name_without_ext + "_redacted.pdf"
 
613
 
614
  try:
615
  #print("Saving review file.")
616
+ review_df = convert_annotation_json_to_review_df(all_image_annotations, review_file_state.copy(), page_sizes=page_sizes)
617
+
618
+ page_sizes_df = pd.DataFrame(page_sizes)
619
+ page_sizes_df.loc[:, "page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
620
+ review_df = divide_coordinates_by_page_sizes(review_df, page_sizes_df)
621
+
622
+ review_df = review_df[["image", "page", "label","color", "xmin", "ymin", "xmax", "ymax", "text", "id"]]
623
+
624
  out_review_file_file_path = output_folder + file_name_with_ext + '_review_file.csv'
625
 
626
  review_df.to_csv(out_review_file_file_path, index=None)
 
793
  row_value_page = evt.row_value[0] # This is the page number value
794
 row_value_label = evt.row_value[1] # This is the label value
795
 row_value_text = evt.row_value[2] # This is the text value
796
+ row_value_id = evt.row_value[3] # This is the id value
797
 
798
+ row_value_df = pd.DataFrame(data={"page":[row_value_page], "label":[row_value_label], "text":[row_value_text], "id":[row_value_id]})
799
 
800
  return row_value_page, row_value_df
801
 
 
829
 
830
  return row_value_page, row_value_df
831
 
832
+ def update_selected_review_df_row_colour(redaction_row_selection:pd.DataFrame, review_df:pd.DataFrame, previous_id:str="", previous_colour:str='(0, 0, 0)', page_sizes:List[dict]=[], output_folder:str=OUTPUT_FOLDER, colour:str='(1, 0, 255)'):
833
  '''
834
  Update the colour of a single redaction box based on the values in a selection row
835
  '''
836
  colour_tuple = str(tuple(colour))
837
 
838
+ if "color" not in review_df.columns: review_df["color"] = '(0, 0, 0)'
839
+ if "id" not in review_df.columns:
840
+ review_df = fill_missing_ids(review_df)
841
 
842
  # Reset existing highlight colours
843
+ review_df.loc[review_df["id"]==previous_id, "color"] = review_df.loc[review_df["id"]==previous_id, "color"].apply(lambda _: previous_colour)
844
+ review_df.loc[review_df["color"].astype(str)==colour, "color"] = review_df.loc[review_df["color"].astype(str)==colour, "color"].apply(lambda _: '(0, 0, 0)')
845
+
846
+ if not redaction_row_selection.empty and not review_df.empty:
847
+ use_id = (
848
+ "id" in redaction_row_selection.columns
849
+ and "id" in review_df.columns
850
+ and not redaction_row_selection["id"].isnull().all()
851
+ and not review_df["id"].isnull().all()
852
+ )
853
+
854
+ selected_merge_cols = ["id"] if use_id else ["label", "page", "text"]
855
+
856
+ review_df = review_df.merge(redaction_row_selection[selected_merge_cols], on=selected_merge_cols, indicator=True, how="left")
857
 
858
+ if "_merge" in review_df.columns:
859
+ filtered_reviews = review_df.loc[review_df["_merge"]=="both"]
860
+ else:
861
+ filtered_reviews = pd.DataFrame()
862
+
863
+ if not filtered_reviews.empty:
864
+ previous_colour = str(filtered_reviews["color"].values[0])
865
+ previous_id = filtered_reviews["id"].values[0]
866
+ review_df.loc[review_df["_merge"]=="both", "color"] = review_df.loc[review_df["_merge"] == "both", "color"].apply(lambda _: colour)
867
+ else:
868
+ # Handle the case where no rows match the condition
869
+ print("No reviews found with _merge == 'both'")
870
+ previous_colour = '(0, 0, 0)'
871
+ review_df.loc[review_df["color"]==colour, "color"] = previous_colour
872
+ previous_id = ''
873
 
874
  review_df.drop("_merge", axis=1, inplace=True)
875
 
876
+ # Ensure that all output coordinates are in proportional size
877
+ #page_sizes_df = pd.DataFrame(page_sizes)
878
+ #page_sizes_df .loc[:, "page"] = pd.to_numeric(page_sizes_df["page"], errors="coerce")
879
+ #print("review_df before divide:", review_df)
880
+ #print("page_sizes_df before divide:", page_sizes_df)
881
+ #review_df = divide_coordinates_by_page_sizes(review_df, page_sizes_df)
882
+ #print("review_df after divide:", review_df)
883
+
884
+ review_df = review_df[["image", "page", "label", "color", "xmin","ymin", "xmax", "ymax", "text", "id"]]
885
 
886
+ return review_df, previous_id, previous_colour
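Returning previous_id and previous_colour gives the caller a one-slot memory, so the next selection can restore the last box's colour before painting the new one. A toy sketch of the same toggle pattern (column names mirror the review frame; the data is illustrative):

import pandas as pd

df = pd.DataFrame({"id": ["a", "b"], "color": ["(0, 0, 0)", "(0, 0, 0)"]})
prev_id, prev_colour, highlight = "", "(0, 0, 0)", "(1, 0, 255)"

for selected in ["a", "b"]:
    df.loc[df["id"] == prev_id, "color"] = prev_colour        # restore the previous box
    prev_colour = df.loc[df["id"] == selected, "color"].values[0]
    prev_id = selected
    df.loc[df["id"] == selected, "color"] = highlight         # paint the new selection
print(df)  # after the loop only row 'b' carries the highlight colour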
887
 
888
  def update_boxes_color(images: list, redaction_row_selection: pd.DataFrame, colour: tuple = (0, 255, 0)):
889
  """
tools/textract_batch_call.py CHANGED
@@ -164,7 +164,7 @@ def analyse_document_with_textract_api(
164
  }])
165
 
166
  # File path
167
- log_file_path = os.path.join(local_output_dir, "textract_job_log_files.csv")
168
 
169
  # Check if file exists
170
  file_exists = os.path.exists(log_file_path)
@@ -444,18 +444,16 @@ def load_in_textract_job_details(load_s3_jobs:str=LOAD_PREVIOUS_TEXTRACT_JOBS_S3
444
  '''
445
  Load in a dataframe of jobs previous submitted to the Textract API service.
446
  '''
447
-
448
  job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
449
 
450
  # Initialize boto3 clients
451
  session = boto3.Session(region_name=aws_region)
452
  s3_client = session.client('s3')
453
 
454
- local_output_path = f'{load_local_jobs_loc}/textract_job_log_files.csv'
455
 
456
  if load_s3_jobs == 'True':
457
-
458
- s3_output_key = f'{load_s3_jobs_loc}/textract_job_log_files.csv'
459
 
460
  try:
461
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
@@ -523,4 +521,10 @@ def download_textract_output(job_id:str,
523
  s3_client.download_file(output_bucket, output_file_key, local_file_path)
524
  print(f"Output file downloaded to: {local_file_path}")
525
  except Exception as e:
526
- print(f"Error downloading file: {e}")
 
 
 
 
 
 
 
164
  }])
165
 
166
  # File path
167
+ log_file_path = os.path.join(local_output_dir, "textract_document_jobs.csv")
168
 
169
  # Check if file exists
170
  file_exists = os.path.exists(log_file_path)
 
444
  '''
445
 Load in a dataframe of jobs previously submitted to the Textract API service.
446
  '''
 
447
  job_df = pd.DataFrame(columns=['job_id','file_name','job_type','signature_extraction','s3_location','job_date_time'])
448
 
449
  # Initialize boto3 clients
450
  session = boto3.Session(region_name=aws_region)
451
  s3_client = session.client('s3')
452
 
453
+ local_output_path = f'{load_local_jobs_loc}/textract_document_jobs.csv'
454
 
455
  if load_s3_jobs == 'True':
456
+ s3_output_key = f'{load_s3_jobs_loc}/textract_document_jobs.csv'
 
457
 
458
  try:
459
  s3_client.head_object(Bucket=document_redaction_bucket, Key=s3_output_key)
 
521
  s3_client.download_file(output_bucket, output_file_key, local_file_path)
522
  print(f"Output file downloaded to: {local_file_path}")
523
  except Exception as e:
524
+ print(f"Error downloading file: {e}")
525
+
526
+ def check_textract_outputs_exist(textract_output_found_checkbox):
527
+ if textract_output_found_checkbox == True:
528
+ print("Textract outputs found")
529
+ return
530
+ else: raise Exception("Relevant Textract outputs not found. Please ensure you have selected the correct results output and you have uploaded the relevant document file in 'Choose document or image file...' above")