seanpedrickcase committed on
Commit
52e26c1
·
1 Parent(s): ef4000e

Updated save options for ocr_outputs_with_words

Browse files
tools/file_conversion.py CHANGED
@@ -1026,10 +1026,10 @@ def divide_coordinates_by_page_sizes(
1026
  if divisors_numeric and "image_width" in df_abs.columns and "image_height" in df_abs.columns:
1027
  # Use np.errstate to suppress warnings about division by zero or NaN if desired
1028
  with np.errstate(divide='ignore', invalid='ignore'):
1029
- df_abs[xmin] = df_abs[xmin] / df_abs["image_width"]
1030
- df_abs[xmax] = df_abs[xmax] / df_abs["image_width"]
1031
- df_abs[ymin] = df_abs[ymin] / df_abs["image_height"]
1032
- df_abs[ymax] = df_abs[ymax] / df_abs["image_height"]
1033
  # Replace potential infinities with NaN (optional, depending on desired outcome)
1034
  df_abs.replace([np.inf, -np.inf], np.nan, inplace=True)
1035
  else:
 
1026
  if divisors_numeric and "image_width" in df_abs.columns and "image_height" in df_abs.columns:
1027
  # Use np.errstate to suppress warnings about division by zero or NaN if desired
1028
  with np.errstate(divide='ignore', invalid='ignore'):
1029
+ df_abs[xmin] = round(df_abs[xmin] / df_abs["image_width"],6)
1030
+ df_abs[xmax] = round(df_abs[xmax] / df_abs["image_width"],6)
1031
+ df_abs[ymin] = round(df_abs[ymin] / df_abs["image_height"],6)
1032
+ df_abs[ymax] = round(df_abs[ymax] / df_abs["image_height"],6)
1033
  # Replace potential infinities with NaN (optional, depending on desired outcome)
1034
  df_abs.replace([np.inf, -np.inf], np.nan, inplace=True)
1035
  else:
tools/file_redaction.py CHANGED
@@ -484,6 +484,20 @@ def choose_and_run_redactor(file_paths:List[str],
484
  # Output file paths names
485
  orig_pdf_file_path = output_folder + pdf_file_name_with_ext
486
  review_file_path = orig_pdf_file_path + '_review_file.csv'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  # Remove any existing review_file paths from the review file outputs
489
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
@@ -556,7 +570,7 @@ def choose_and_run_redactor(file_paths:List[str],
556
  all_line_level_ocr_results_df,
557
  all_pages_decision_process_table,
558
  pymupdf_doc,
559
- [], # All line level ocr results with words
560
  pii_identification_method,
561
  comprehend_query_number,
562
  comprehend_client,
@@ -616,15 +630,13 @@ def choose_and_run_redactor(file_paths:List[str],
616
  duplication_file_path_outputs.append(ocr_file_path)
617
 
618
  if all_page_line_level_ocr_results_with_words:
619
- #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
 
 
620
 
621
  all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
622
 
623
- # print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
624
-
625
- file_name = get_file_name_without_type(file_path)
626
-
627
- all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
628
 
629
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
630
  json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
@@ -635,9 +647,17 @@ def choose_and_run_redactor(file_paths:List[str],
635
 
636
  all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
637
 
638
- all_page_line_level_ocr_results_with_words_df_file_path = output_folder + file_name + "_ocr_results_with_words.csv"
 
 
639
 
640
- all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path)
 
 
 
 
 
 
641
 
642
  # Convert the gradio annotation boxes to relative coordinates
643
  # Convert annotations_all_pages to a consistent relative coordinate format output
@@ -703,8 +723,7 @@ def choose_and_run_redactor(file_paths:List[str],
703
  if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
704
  else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
705
 
706
- if total_textract_query_number > number_of_pages:
707
- total_textract_query_number = number_of_pages
708
 
709
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
710
 
@@ -1431,8 +1450,6 @@ def redact_image_pdf(file_path:str,
1431
  if not all_pages_decision_process_table.empty:
1432
  all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
1433
 
1434
- #all_line_level_ocr_results_list = [all_line_level_ocr_results_df.to_dict('records')]#[all_line_level_ocr_results_df]
1435
- #all_pages_decision_process_list = [all_pages_decision_process_table.to_dict('records')]#[all_pages_decision_process_table]
1436
 
1437
  # Go through each page
1438
  for page_no in progress_bar:
@@ -1572,6 +1589,8 @@ def redact_image_pdf(file_path:str,
1572
 
1573
  page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1574
 
 
 
1575
  # Convert to DataFrame and add to ongoing logging table
1576
  line_level_ocr_results_df = pd.DataFrame([{
1577
  'page': page_line_level_ocr_results['page'],
@@ -1582,8 +1601,6 @@ def redact_image_pdf(file_path:str,
1582
  'height': result.height
1583
  } for result in page_line_level_ocr_results['results']])
1584
 
1585
- #all_line_level_ocr_results_list.append(line_level_ocr_results_df.to_dict('records'))
1586
-
1587
  if not line_level_ocr_results_df.empty: # Ensure there are records to add
1588
  all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
1589
 
@@ -1737,25 +1754,7 @@ def redact_image_pdf(file_path:str,
1737
  json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
1738
 
1739
  if textract_json_file_path not in log_files_output_paths:
1740
- log_files_output_paths.append(textract_json_file_path)
1741
-
1742
- all_page_line_level_ocr_results_with_words_json_file_path_textract = output_folder + file_name + "_ocr_results_with_words_textract.json"
1743
-
1744
- with open(all_page_line_level_ocr_results_with_words_json_file_path_textract, 'w') as json_file:
1745
- json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
1746
-
1747
- if all_page_line_level_ocr_results_with_words_json_file_path_textract not in log_files_output_paths:
1748
- log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path_textract)
1749
-
1750
- if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
1751
- if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
1752
- # Write the updated existing textract data back to the JSON file
1753
-
1754
- with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
1755
- json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
1756
-
1757
- if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
1758
- log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
1759
 
1760
  all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1761
  all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
@@ -2338,7 +2337,6 @@ def redact_text_pdf(
2338
  pass
2339
  #print("Not redacting page:", page_no)
2340
 
2341
-
2342
  # Join extracted text outputs for all lines together
2343
  if not page_text_ocr_outputs.empty:
2344
  #page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
@@ -2421,10 +2419,10 @@ def redact_text_pdf(
2421
  all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
2422
  all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
2423
 
2424
- all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
2425
 
2426
  #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
2427
- with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
2428
- json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
2429
 
2430
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
 
484
  # Output file paths names
485
  orig_pdf_file_path = output_folder + pdf_file_name_with_ext
486
  review_file_path = orig_pdf_file_path + '_review_file.csv'
487
+
488
+ # Load in all_ocr_results_with_words if it exists as a file path and doesn't exist already
489
+ file_name = get_file_name_without_type(file_path)
490
+
491
+ if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_text.json"
492
+ elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_ocr.json"
493
+ elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_textract.json"
494
+ all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + file_ending
495
+
496
+ if not all_page_line_level_ocr_results_with_words:
497
+ if local_ocr_output_found_checkbox == True and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path):
498
+ all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
499
+ #original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
500
+
501
 
502
  # Remove any existing review_file paths from the review file outputs
503
  if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
 
570
  all_line_level_ocr_results_df,
571
  all_pages_decision_process_table,
572
  pymupdf_doc,
573
+ all_page_line_level_ocr_results_with_words,
574
  pii_identification_method,
575
  comprehend_query_number,
576
  comprehend_client,
 
630
  duplication_file_path_outputs.append(ocr_file_path)
631
 
632
  if all_page_line_level_ocr_results_with_words:
633
+ #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
634
+ #
635
+ #if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
636
 
637
  all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
638
 
639
+ # print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
 
 
 
 
640
 
641
  with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
642
  json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
 
647
 
648
  all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
649
 
650
+ all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
651
+ all_page_line_level_ocr_results_with_words_df_file_path = (output_folder + file_name + file_ending).replace(".json", ".csv")
652
+ all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None)
653
 
654
+
655
+
656
+ if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
657
+ log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
658
+
659
+ if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
660
+ log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
661
 
662
  # Convert the gradio annotation boxes to relative coordinates
663
  # Convert annotations_all_pages to a consistent relative coordinate format output
 
723
  if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
724
  else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
725
 
726
+ if total_textract_query_number > number_of_pages: total_textract_query_number = number_of_pages
 
727
 
728
  return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
729
 
 
1450
  if not all_pages_decision_process_table.empty:
1451
  all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
1452
 
 
 
1453
 
1454
  # Go through each page
1455
  for page_no in progress_bar:
 
1589
 
1590
  page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1591
 
1592
+ all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
1593
+
1594
  # Convert to DataFrame and add to ongoing logging table
1595
  line_level_ocr_results_df = pd.DataFrame([{
1596
  'page': page_line_level_ocr_results['page'],
 
1601
  'height': result.height
1602
  } for result in page_line_level_ocr_results['results']])
1603
 
 
 
1604
  if not line_level_ocr_results_df.empty: # Ensure there are records to add
1605
  all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
1606
 
 
1754
  json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
1755
 
1756
  if textract_json_file_path not in log_files_output_paths:
1757
+ log_files_output_paths.append(textract_json_file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1758
 
1759
  all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
1760
  all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
 
2337
  pass
2338
  #print("Not redacting page:", page_no)
2339
 
 
2340
  # Join extracted text outputs for all lines together
2341
  if not page_text_ocr_outputs.empty:
2342
  #page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
 
2419
  all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
2420
  all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
2421
 
2422
+ #all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
2423
 
2424
  #print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
2425
+ #with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
2426
+ # json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
2427
 
2428
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
tools/helper_functions.py CHANGED
@@ -252,16 +252,10 @@ def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:
252
  print("No valid text extraction method found. Returning False")
253
  return False
254
 
255
- print("doc_file_name_no_extension_textbox:", doc_file_name_no_extension_textbox)
256
-
257
  doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
258
 
259
- print("doc_file_with_ending:", doc_file_with_ending)
260
-
261
  local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
262
 
263
- print("looking for file path:", local_ocr_output_path)
264
-
265
  if os.path.exists(local_ocr_output_path):
266
  print("Existing OCR with words analysis output file found.")
267
  return True
 
252
  print("No valid text extraction method found. Returning False")
253
  return False
254
 
 
 
255
  doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
256
 
 
 
257
  local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
258
 
 
 
259
  if os.path.exists(local_ocr_output_path):
260
  print("Existing OCR with words analysis output file found.")
261
  return True