Commit
·
52e26c1
1
Parent(s):
ef4000e
Updated save options for ocr_outputs_with_words
Browse files
- tools/file_conversion.py +4 -4
- tools/file_redaction.py +36 -38
- tools/helper_functions.py +0 -6
tools/file_conversion.py
CHANGED
@@ -1026,10 +1026,10 @@ def divide_coordinates_by_page_sizes(
|
|
1026 |
if divisors_numeric and "image_width" in df_abs.columns and "image_height" in df_abs.columns:
|
1027 |
# Use np.errstate to suppress warnings about division by zero or NaN if desired
|
1028 |
with np.errstate(divide='ignore', invalid='ignore'):
|
1029 |
-
df_abs[xmin] = df_abs[xmin] / df_abs["image_width"]
|
1030 |
-
df_abs[xmax] = df_abs[xmax] / df_abs["image_width"]
|
1031 |
-
df_abs[ymin] = df_abs[ymin] / df_abs["image_height"]
|
1032 |
-
df_abs[ymax] = df_abs[ymax] / df_abs["image_height"]
|
1033 |
# Replace potential infinities with NaN (optional, depending on desired outcome)
|
1034 |
df_abs.replace([np.inf, -np.inf], np.nan, inplace=True)
|
1035 |
else:
|
|
|
1026 |
if divisors_numeric and "image_width" in df_abs.columns and "image_height" in df_abs.columns:
|
1027 |
# Use np.errstate to suppress warnings about division by zero or NaN if desired
|
1028 |
with np.errstate(divide='ignore', invalid='ignore'):
|
1029 |
+
df_abs[xmin] = round(df_abs[xmin] / df_abs["image_width"],6)
|
1030 |
+
df_abs[xmax] = round(df_abs[xmax] / df_abs["image_width"],6)
|
1031 |
+
df_abs[ymin] = round(df_abs[ymin] / df_abs["image_height"],6)
|
1032 |
+
df_abs[ymax] = round(df_abs[ymax] / df_abs["image_height"],6)
|
1033 |
# Replace potential infinities with NaN (optional, depending on desired outcome)
|
1034 |
df_abs.replace([np.inf, -np.inf], np.nan, inplace=True)
|
1035 |
else:
|
tools/file_redaction.py
CHANGED
@@ -484,6 +484,20 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
484 |
# Output file paths names
|
485 |
orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
486 |
review_file_path = orig_pdf_file_path + '_review_file.csv'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
487 |
|
488 |
# Remove any existing review_file paths from the review file outputs
|
489 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
@@ -556,7 +570,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
556 |
all_line_level_ocr_results_df,
|
557 |
all_pages_decision_process_table,
|
558 |
pymupdf_doc,
|
559 |
-
|
560 |
pii_identification_method,
|
561 |
comprehend_query_number,
|
562 |
comprehend_client,
|
@@ -616,15 +630,13 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
616 |
duplication_file_path_outputs.append(ocr_file_path)
|
617 |
|
618 |
if all_page_line_level_ocr_results_with_words:
|
619 |
-
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
|
|
|
|
620 |
|
621 |
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
622 |
|
623 |
-
# print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
624 |
-
|
625 |
-
file_name = get_file_name_without_type(file_path)
|
626 |
-
|
627 |
-
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words.json"
|
628 |
|
629 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
630 |
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
@@ -635,9 +647,17 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
635 |
|
636 |
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
|
637 |
|
638 |
-
|
|
|
|
|
639 |
|
640 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
641 |
|
642 |
# Convert the gradio annotation boxes to relative coordinates
|
643 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
@@ -703,8 +723,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
703 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
704 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
705 |
|
706 |
-
if total_textract_query_number > number_of_pages:
|
707 |
-
total_textract_query_number = number_of_pages
|
708 |
|
709 |
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
710 |
|
@@ -1431,8 +1450,6 @@ def redact_image_pdf(file_path:str,
|
|
1431 |
if not all_pages_decision_process_table.empty:
|
1432 |
all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
|
1433 |
|
1434 |
-
#all_line_level_ocr_results_list = [all_line_level_ocr_results_df.to_dict('records')]#[all_line_level_ocr_results_df]
|
1435 |
-
#all_pages_decision_process_list = [all_pages_decision_process_table.to_dict('records')]#[all_pages_decision_process_table]
|
1436 |
|
1437 |
# Go through each page
|
1438 |
for page_no in progress_bar:
|
@@ -1572,6 +1589,8 @@ def redact_image_pdf(file_path:str,
|
|
1572 |
|
1573 |
page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1574 |
|
|
|
|
|
1575 |
# Convert to DataFrame and add to ongoing logging table
|
1576 |
line_level_ocr_results_df = pd.DataFrame([{
|
1577 |
'page': page_line_level_ocr_results['page'],
|
@@ -1582,8 +1601,6 @@ def redact_image_pdf(file_path:str,
|
|
1582 |
'height': result.height
|
1583 |
} for result in page_line_level_ocr_results['results']])
|
1584 |
|
1585 |
-
#all_line_level_ocr_results_list.append(line_level_ocr_results_df.to_dict('records'))
|
1586 |
-
|
1587 |
if not line_level_ocr_results_df.empty: # Ensure there are records to add
|
1588 |
all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
|
1589 |
|
@@ -1737,25 +1754,7 @@ def redact_image_pdf(file_path:str,
|
|
1737 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1738 |
|
1739 |
if textract_json_file_path not in log_files_output_paths:
|
1740 |
-
log_files_output_paths.append(textract_json_file_path)
|
1741 |
-
|
1742 |
-
all_page_line_level_ocr_results_with_words_json_file_path_textract = output_folder + file_name + "_ocr_results_with_words_textract.json"
|
1743 |
-
|
1744 |
-
with open(all_page_line_level_ocr_results_with_words_json_file_path_textract, 'w') as json_file:
|
1745 |
-
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1746 |
-
|
1747 |
-
if all_page_line_level_ocr_results_with_words_json_file_path_textract not in log_files_output_paths:
|
1748 |
-
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path_textract)
|
1749 |
-
|
1750 |
-
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
1751 |
-
if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
1752 |
-
# Write the updated existing textract data back to the JSON file
|
1753 |
-
|
1754 |
-
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
1755 |
-
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
1756 |
-
|
1757 |
-
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
1758 |
-
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
1759 |
|
1760 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1761 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
@@ -2338,7 +2337,6 @@ def redact_text_pdf(
|
|
2338 |
pass
|
2339 |
#print("Not redacting page:", page_no)
|
2340 |
|
2341 |
-
|
2342 |
# Join extracted text outputs for all lines together
|
2343 |
if not page_text_ocr_outputs.empty:
|
2344 |
#page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
@@ -2421,10 +2419,10 @@ def redact_text_pdf(
|
|
2421 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
2422 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
2423 |
|
2424 |
-
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
|
2425 |
|
2426 |
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
2427 |
-
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
2428 |
-
|
2429 |
|
2430 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
|
|
484 |
# Output file paths names
|
485 |
orig_pdf_file_path = output_folder + pdf_file_name_with_ext
|
486 |
review_file_path = orig_pdf_file_path + '_review_file.csv'
|
487 |
+
|
488 |
+
# Load in all_ocr_results_with_words if it exists as a file path and doesn't exist already
|
489 |
+
file_name = get_file_name_without_type(file_path)
|
490 |
+
|
491 |
+
if text_extraction_method == SELECTABLE_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_text.json"
|
492 |
+
elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_local_ocr.json"
|
493 |
+
elif text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION: file_ending = "_ocr_results_with_words_textract.json"
|
494 |
+
all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + file_ending
|
495 |
+
|
496 |
+
if not all_page_line_level_ocr_results_with_words:
|
497 |
+
if local_ocr_output_found_checkbox == True and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path):
|
498 |
+
all_page_line_level_ocr_results_with_words, is_missing, log_files_output_paths = load_and_convert_ocr_results_with_words_json(all_page_line_level_ocr_results_with_words_json_file_path, log_files_output_paths, page_sizes_df)
|
499 |
+
#original_all_page_line_level_ocr_results_with_words = all_page_line_level_ocr_results_with_words.copy()
|
500 |
+
|
501 |
|
502 |
# Remove any existing review_file paths from the review file outputs
|
503 |
if text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION or text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
|
|
570 |
all_line_level_ocr_results_df,
|
571 |
all_pages_decision_process_table,
|
572 |
pymupdf_doc,
|
573 |
+
all_page_line_level_ocr_results_with_words,
|
574 |
pii_identification_method,
|
575 |
comprehend_query_number,
|
576 |
comprehend_client,
|
|
|
630 |
duplication_file_path_outputs.append(ocr_file_path)
|
631 |
|
632 |
if all_page_line_level_ocr_results_with_words:
|
633 |
+
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
634 |
+
#
|
635 |
+
#if original_all_page_line_level_ocr_results_with_words != all_page_line_level_ocr_results_with_words:
|
636 |
|
637 |
all_page_line_level_ocr_results_with_words = merge_page_results(all_page_line_level_ocr_results_with_words)
|
638 |
|
639 |
+
# print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
|
|
|
|
|
|
|
|
640 |
|
641 |
with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
642 |
json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
|
|
647 |
|
648 |
all_page_line_level_ocr_results_with_words_df = divide_coordinates_by_page_sizes(all_page_line_level_ocr_results_with_words_df, page_sizes_df, xmin="line_x0", xmax="line_x1", ymin="line_y0", ymax="line_y1")
|
649 |
|
650 |
+
all_page_line_level_ocr_results_with_words_df.sort_values(["page", "line", "word_x0"], inplace=True)
|
651 |
+
all_page_line_level_ocr_results_with_words_df_file_path = (output_folder + file_name + file_ending).replace(".json", ".csv")
|
652 |
+
all_page_line_level_ocr_results_with_words_df.to_csv(all_page_line_level_ocr_results_with_words_df_file_path, index = None)
|
653 |
|
654 |
+
|
655 |
+
|
656 |
+
if all_page_line_level_ocr_results_with_words_json_file_path not in log_files_output_paths:
|
657 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_json_file_path)
|
658 |
+
|
659 |
+
if all_page_line_level_ocr_results_with_words_df_file_path not in log_files_output_paths:
|
660 |
+
log_files_output_paths.append(all_page_line_level_ocr_results_with_words_df_file_path)
|
661 |
|
662 |
# Convert the gradio annotation boxes to relative coordinates
|
663 |
# Convert annotations_all_pages to a consistent relative coordinate format output
|
|
|
723 |
if not review_file_path: review_out_file_paths = [prepared_pdf_file_paths[-1]]
|
724 |
else: review_out_file_paths = [prepared_pdf_file_paths[-1], review_file_path]
|
725 |
|
726 |
+
if total_textract_query_number > number_of_pages: total_textract_query_number = number_of_pages
|
|
|
727 |
|
728 |
return combined_out_message, out_file_paths, out_file_paths, gr.Number(value=latest_file_completed, label="Number of documents redacted", interactive=False, visible=False), log_files_output_paths, log_files_output_paths, estimated_time_taken_state, all_request_metadata_str, pymupdf_doc, annotations_all_pages_divide, gr.Number(value=current_loop_page, precision=0, interactive=False, label = "Last redacted page in document", visible=False), gr.Checkbox(value = True, label="Page break reached", visible=False), all_line_level_ocr_results_df, all_pages_decision_process_table, comprehend_query_number, review_out_file_paths, annotate_max_pages, annotate_max_pages, prepared_pdf_file_paths, pdf_image_file_paths, review_file_state, page_sizes, duplication_file_path_outputs, duplication_file_path_outputs, review_file_path, total_textract_query_number, ocr_file_path, all_page_line_level_ocr_results, all_page_line_level_ocr_results_with_words
|
729 |
|
|
|
1450 |
if not all_pages_decision_process_table.empty:
|
1451 |
all_pages_decision_process_list.extend(all_pages_decision_process_table.to_dict('records'))
|
1452 |
|
|
|
|
|
1453 |
|
1454 |
# Go through each page
|
1455 |
for page_no in progress_bar:
|
|
|
1589 |
|
1590 |
page_line_level_ocr_results, handwriting_or_signature_boxes, page_signature_recogniser_results, page_handwriting_recogniser_results, page_line_level_ocr_results_with_words = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1591 |
|
1592 |
+
all_page_line_level_ocr_results_with_words.append(page_line_level_ocr_results_with_words)
|
1593 |
+
|
1594 |
# Convert to DataFrame and add to ongoing logging table
|
1595 |
line_level_ocr_results_df = pd.DataFrame([{
|
1596 |
'page': page_line_level_ocr_results['page'],
|
|
|
1601 |
'height': result.height
|
1602 |
} for result in page_line_level_ocr_results['results']])
|
1603 |
|
|
|
|
|
1604 |
if not line_level_ocr_results_df.empty: # Ensure there are records to add
|
1605 |
all_line_level_ocr_results_list.extend(line_level_ocr_results_df.to_dict('records'))
|
1606 |
|
|
|
1754 |
json.dump(textract_data, json_file, separators=(",", ":")) # indent=4 makes the JSON file pretty-printed
|
1755 |
|
1756 |
if textract_json_file_path not in log_files_output_paths:
|
1757 |
+
log_files_output_paths.append(textract_json_file_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1758 |
|
1759 |
all_pages_decision_process_table = pd.DataFrame(all_pages_decision_process_list)
|
1760 |
all_line_level_ocr_results_df = pd.DataFrame(all_line_level_ocr_results_list)
|
|
|
2337 |
pass
|
2338 |
#print("Not redacting page:", page_no)
|
2339 |
|
|
|
2340 |
# Join extracted text outputs for all lines together
|
2341 |
if not page_text_ocr_outputs.empty:
|
2342 |
#page_text_ocr_outputs = page_text_ocr_outputs.sort_values(["top", "left"], ascending=[False, False]).reset_index(drop=True)
|
|
|
2419 |
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
2420 |
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
2421 |
|
2422 |
+
#all_page_line_level_ocr_results_with_words_json_file_path = output_folder + file_name + "_ocr_results_with_words_local_text.json"
|
2423 |
|
2424 |
#print("all_page_line_level_ocr_results_with_words:", all_page_line_level_ocr_results_with_words)
|
2425 |
+
#with open(all_page_line_level_ocr_results_with_words_json_file_path, 'w') as json_file:
|
2426 |
+
# json.dump(all_page_line_level_ocr_results_with_words, json_file, separators=(",", ":"))
|
2427 |
|
2428 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number, all_page_line_level_ocr_results_with_words
|
tools/helper_functions.py
CHANGED
@@ -252,16 +252,10 @@ def check_for_relevant_ocr_output_with_words(doc_file_name_no_extension_textbox:
|
|
252 |
print("No valid text extraction method found. Returning False")
|
253 |
return False
|
254 |
|
255 |
-
print("doc_file_name_no_extension_textbox:", doc_file_name_no_extension_textbox)
|
256 |
-
|
257 |
doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
|
258 |
|
259 |
-
print("doc_file_with_ending:", doc_file_with_ending)
|
260 |
-
|
261 |
local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
|
262 |
|
263 |
-
print("looking for file path:", local_ocr_output_path)
|
264 |
-
|
265 |
if os.path.exists(local_ocr_output_path):
|
266 |
print("Existing OCR with words analysis output file found.")
|
267 |
return True
|
|
|
252 |
print("No valid text extraction method found. Returning False")
|
253 |
return False
|
254 |
|
|
|
|
|
255 |
doc_file_with_ending = doc_file_name_no_extension_textbox + file_ending
|
256 |
|
|
|
|
|
257 |
local_ocr_output_path = os.path.join(output_folder, doc_file_with_ending)
|
258 |
|
|
|
|
|
259 |
if os.path.exists(local_ocr_output_path):
|
260 |
print("Existing OCR with words analysis output file found.")
|
261 |
return True
|