Spaces:
Sleeping
Sleeping
Commit
·
143e2cc
1
Parent(s):
c3a8cd7
The app now resizes images that are too large before sending them to Textract. Textract calls are now more robust to failure. Improved the reliability of the JSON conversion to the review dataframe.
Browse files- app.py +1 -1
- tools/custom_image_analyser_engine.py +3 -3
- tools/file_conversion.py +69 -39
- tools/file_redaction.py +50 -20
- tools/helper_functions.py +1 -1
- tools/redaction_review.py +1 -1
app.py
CHANGED
@@ -321,7 +321,7 @@ with app:
|
|
321 |
###
|
322 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
323 |
|
324 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator]).\
|
325 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
326 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
327 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
|
|
|
321 |
###
|
322 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
323 |
|
324 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
|
325 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
326 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
327 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number], api_name="redact_doc").\
|
tools/custom_image_analyser_engine.py
CHANGED
@@ -637,9 +637,9 @@ class CustomImageAnalyzerEngine:
|
|
637 |
result_reset_pos.start = 0
|
638 |
result_reset_pos.end = len(relevant_text)
|
639 |
|
640 |
-
print("result_reset_pos:", result_reset_pos)
|
641 |
-
print("relevant_line_ocr_result:", relevant_line_ocr_result)
|
642 |
-
print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
|
643 |
|
644 |
# Map the analyzer results to bounding boxes for this line
|
645 |
line_results = self.map_analyzer_results_to_bounding_boxes(
|
|
|
637 |
result_reset_pos.start = 0
|
638 |
result_reset_pos.end = len(relevant_text)
|
639 |
|
640 |
+
#print("result_reset_pos:", result_reset_pos)
|
641 |
+
#print("relevant_line_ocr_result:", relevant_line_ocr_result)
|
642 |
+
#print("ocr_results_with_children_line_level:", ocr_results_with_children_line_level)
|
643 |
|
644 |
# Map the analyzer results to bounding boxes for this line
|
645 |
line_results = self.map_analyzer_results_to_bounding_boxes(
|
tools/file_conversion.py
CHANGED
@@ -51,26 +51,40 @@ def is_pdf(filename):
|
|
51 |
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
|
52 |
print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
|
53 |
|
|
|
|
|
|
|
|
|
54 |
def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input') -> tuple[int, str]:
|
55 |
try:
|
56 |
-
# Construct the full output directory path
|
57 |
output_dir = os.path.join(os.getcwd(), output_dir)
|
58 |
-
|
59 |
-
# Use the output_dir to construct the out_path
|
60 |
out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
|
61 |
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
62 |
-
|
63 |
if os.path.exists(out_path):
|
64 |
-
#
|
65 |
image = Image.open(out_path)
|
66 |
else:
|
67 |
-
#
|
68 |
image_l = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
|
69 |
dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
|
70 |
image = image_l[0]
|
71 |
image = image.convert("L")
|
72 |
image.save(out_path, format="PNG")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
return page_num, out_path
|
|
|
74 |
except Exception as e:
|
75 |
print(f"Error processing page {page_num + 1}: {e}")
|
76 |
return page_num, None
|
@@ -683,14 +697,20 @@ def join_values_within_threshold(df1, df2):
|
|
683 |
print(final_df)
|
684 |
|
685 |
|
686 |
-
def convert_review_json_to_pandas_df(
|
|
|
|
|
|
|
687 |
# Flatten the data
|
688 |
-
|
689 |
|
690 |
-
|
691 |
-
|
|
|
|
|
|
|
692 |
#print("flattened_data:", flattened_data)
|
693 |
-
image_path =
|
694 |
|
695 |
# Use regex to find the number before .png
|
696 |
match = re.search(r'_(\d+)\.png$', image_path)
|
@@ -701,56 +721,66 @@ def convert_review_json_to_pandas_df(data:List[dict], text_join_data=pd.DataFram
|
|
701 |
else:
|
702 |
print("No number found before .png")
|
703 |
|
704 |
-
# Check if 'boxes' is in the
|
705 |
-
if 'boxes' not in
|
706 |
-
|
707 |
|
708 |
-
for box in
|
709 |
if 'text' not in box:
|
710 |
-
data_to_add = {"image": image_path, "page": reported_number, **box} # "text":
|
711 |
else:
|
712 |
-
data_to_add = {"image": image_path, "page": reported_number, "text":
|
713 |
#print("data_to_add:", data_to_add)
|
714 |
-
|
715 |
|
716 |
# Convert to a DataFrame
|
717 |
-
|
|
|
|
|
|
|
718 |
|
719 |
# Join on additional text data from decision output results if included
|
720 |
-
if not
|
721 |
-
#print("
|
722 |
-
#print("
|
723 |
-
|
724 |
-
|
725 |
-
|
|
|
|
|
726 |
# Round to the closest number divisible by 5
|
727 |
-
|
728 |
-
|
|
|
729 |
|
730 |
-
|
|
|
|
|
731 |
|
732 |
-
|
733 |
|
734 |
-
|
735 |
|
736 |
-
|
737 |
|
738 |
-
|
739 |
-
|
|
|
|
|
740 |
|
741 |
-
|
742 |
|
743 |
-
return
|
744 |
|
745 |
-
def convert_pandas_df_to_review_json(
|
746 |
'''
|
747 |
Convert a review csv to a json file for use by the Gradio Annotation object
|
748 |
'''
|
749 |
# Keep only necessary columns
|
750 |
-
|
751 |
|
752 |
# Group the DataFrame by the 'image' column
|
753 |
-
grouped_csv_pages =
|
754 |
|
755 |
# Create a list to hold the JSON data
|
756 |
json_data = []
|
@@ -758,7 +788,7 @@ def convert_pandas_df_to_review_json(df: pd.DataFrame, image_paths: List[Image.I
|
|
758 |
for n, pdf_image_path in enumerate(image_paths):
|
759 |
reported_page_number = int(n + 1)
|
760 |
|
761 |
-
if reported_page_number in
|
762 |
|
763 |
# Convert each relevant group to a list of box dictionaries
|
764 |
selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
|
|
|
51 |
CUSTOM_BOX_COLOUR = get_or_create_env_var("CUSTOM_BOX_COLOUR", "")
|
52 |
print(f'The value of CUSTOM_BOX_COLOUR is {CUSTOM_BOX_COLOUR}')
|
53 |
|
54 |
+
import os
|
55 |
+
from pdf2image import convert_from_path
|
56 |
+
from PIL import Image
|
57 |
+
|
58 |
def process_single_page(pdf_path: str, page_num: int, image_dpi: float, output_dir: str = 'input', max_size_bytes: int = 5 * 1024 * 1024) -> tuple[int, str]:
    '''
    Render one page of a PDF to a greyscale PNG on disk, shrinking it if the file is too large.

    If a PNG for this page already exists in the output folder it is reused rather than re-rendered.
    After rendering (or reuse), the file is downscaled - keeping its aspect ratio - until it is
    smaller than max_size_bytes, or until a small fixed number of attempts has been used up.

    Args:
        pdf_path: Path to the source PDF. Only its base name is used to build the image file name.
        page_num: Zero-based page index. The saved file is named "<pdf base name>_<page_num>.png".
        image_dpi: Resolution passed to pdf2image when rendering the page.
        output_dir: Folder for the page image. A relative path is resolved against the current working directory.
        max_size_bytes: Size the saved PNG must stay below. Defaults to 5 MB.

    Returns:
        (page_num, out_path) on success, or (page_num, None) if anything went wrong. Errors are
        printed rather than raised so that one bad page does not stop the others being processed.
    '''
    try:
        # Construct the full output directory path (an absolute output_dir is left unchanged by os.path.join)
        output_dir = os.path.join(os.getcwd(), output_dir)
        out_path = os.path.join(output_dir, f"{os.path.basename(pdf_path)}_{page_num}.png")
        os.makedirs(output_dir, exist_ok=True)

        if not os.path.exists(out_path):
            # Convert PDF page to image. pdf2image pages are 1-based, hence page_num + 1
            page_images = convert_from_path(pdf_path, first_page=page_num+1, last_page=page_num+1,
                                            dpi=image_dpi, use_cropbox=True, use_pdftocairo=False)
            page_images[0].convert("L").save(out_path, format="PNG")

        # Check file size and resize if necessary. PNG size does not scale exactly with pixel
        # count, so one pass may not be enough: re-measure after every save and try again.
        max_resize_attempts = 5
        for _ in range(max_resize_attempts):
            file_size = os.path.getsize(out_path)
            if file_size < max_size_bytes:
                break

            # Only open the image when it actually needs resizing, and always close the handle
            with Image.open(out_path) as image:
                # Scale both sides by the square root of the size ratio, with a 5% margin so
                # that a file sitting exactly on the limit still gets smaller
                ratio = 0.95 * (max_size_bytes / file_size) ** 0.5
                new_size = (max(1, int(image.width * ratio)), max(1, int(image.height * ratio)))
                # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
                resized_image = image.resize(new_size, Image.LANCZOS)

            resized_image.save(out_path, format="PNG")  # Overwrite with resized image
        else:
            if os.path.getsize(out_path) >= max_size_bytes:
                print(f"Warning: image for page {page_num + 1} is still larger than {max_size_bytes} bytes after resizing")

        return page_num, out_path

    except Exception as e:
        print(f"Error processing page {page_num + 1}: {e}")
        return page_num, None
|
|
|
697 |
print(final_df)
|
698 |
|
699 |
|
700 |
+
def convert_review_json_to_pandas_df(all_annotations:List[dict], redaction_decision_output:pd.DataFrame=pd.DataFrame()) -> pd.DataFrame:
|
701 |
+
'''
|
702 |
+
Convert the annotation json data to a dataframe format. Add on any text from the initial review_file dataframe by joining on pages/co-ordinates (doesn't work very well currently).
|
703 |
+
'''
|
704 |
# Flatten the data
|
705 |
+
flattened_annotation_data = []
|
706 |
|
707 |
+
if not isinstance(redaction_decision_output, pd.DataFrame):
|
708 |
+
redaction_decision_output = pd.DataFrame()
|
709 |
+
|
710 |
+
for annotation in all_annotations:
|
711 |
+
#print("annotation:", annotation)
|
712 |
#print("flattened_data:", flattened_data)
|
713 |
+
image_path = annotation["image"]
|
714 |
|
715 |
# Use regex to find the number before .png
|
716 |
match = re.search(r'_(\d+)\.png$', image_path)
|
|
|
721 |
else:
|
722 |
print("No number found before .png")
|
723 |
|
724 |
+
# Check if 'boxes' is in the annotation, if not, add an empty list
|
725 |
+
if 'boxes' not in annotation:
|
726 |
+
annotation['boxes'] = []
|
727 |
|
728 |
+
for box in annotation["boxes"]:
|
729 |
if 'text' not in box:
|
730 |
+
data_to_add = {"image": image_path, "page": reported_number, **box} # "text": annotation['text'],
|
731 |
else:
|
732 |
+
data_to_add = {"image": image_path, "page": reported_number, "text": annotation['text'], **box}
|
733 |
#print("data_to_add:", data_to_add)
|
734 |
+
flattened_annotation_data.append(data_to_add)
|
735 |
|
736 |
# Convert to a DataFrame
|
737 |
+
annotation_data_as_df = pd.DataFrame(flattened_annotation_data)
|
738 |
+
|
739 |
+
#print("redaction_decision_output:", redaction_decision_output)
|
740 |
+
#print("annotation_data_as_df:", annotation_data_as_df)
|
741 |
|
742 |
# Join on additional text data from decision output results if included
|
743 |
+
if not redaction_decision_output.empty:
|
744 |
+
#print("redaction_decision_output is not empty")
|
745 |
+
#print("redaction_decision_output:", redaction_decision_output)
|
746 |
+
#print("annotation_data_as_df:", annotation_data_as_df)
|
747 |
+
redaction_decision_output['page'] = redaction_decision_output['page'].astype(str)
|
748 |
+
annotation_data_as_df['page'] = annotation_data_as_df['page'].astype(str)
|
749 |
+
redaction_decision_output = redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page', 'text']]
|
750 |
+
|
751 |
# Round to the closest number divisible by 5
|
752 |
+
redaction_decision_output.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (redaction_decision_output[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
753 |
+
|
754 |
+
redaction_decision_output = redaction_decision_output.drop_duplicates(['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'])
|
755 |
|
756 |
+
#annotation_data_as_df[['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
757 |
+
|
758 |
+
annotation_data_as_df.loc[:, ['xmin1', 'ymin1', 'xmax1', 'ymax1']] = (annotation_data_as_df[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
759 |
|
760 |
+
annotation_data_as_df = annotation_data_as_df.merge(redaction_decision_output, left_on = ['xmin1', 'ymin1', 'xmax1', 'ymax1', 'label', 'page'], right_on = ['xmin', 'ymin', 'xmax', 'ymax', 'label', 'page'], how = "left", suffixes=("", "_y"))
|
761 |
|
762 |
+
annotation_data_as_df = annotation_data_as_df.drop(['xmin1', 'ymin1', 'xmax1', 'ymax1', 'xmin_y', 'ymin_y', 'xmax_y', 'ymax_y'], axis=1, errors="ignore")
|
763 |
|
764 |
+
annotation_data_as_df = annotation_data_as_df[["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]]
|
765 |
|
766 |
+
# Ensure required columns exist, filling with blank if they don't
|
767 |
+
for col in ["image", "page", "label", "color", "xmin", "ymin", "xmax", "ymax", "text"]:
|
768 |
+
if col not in annotation_data_as_df.columns:
|
769 |
+
annotation_data_as_df[col] = ''
|
770 |
|
771 |
+
annotation_data_as_df = annotation_data_as_df.sort_values(['page', 'ymin', 'xmin', 'label'])
|
772 |
|
773 |
+
return annotation_data_as_df
|
774 |
|
775 |
+
def convert_pandas_df_to_review_json(review_file_df: pd.DataFrame, image_paths: List[Image.Image]) -> List[dict]:
|
776 |
'''
|
777 |
Convert a review csv to a json file for use by the Gradio Annotation object
|
778 |
'''
|
779 |
# Keep only necessary columns
|
780 |
+
review_file_df = review_file_df[["image", "page", "xmin", "ymin", "xmax", "ymax", "color", "label"]]
|
781 |
|
782 |
# Group the DataFrame by the 'image' column
|
783 |
+
grouped_csv_pages = review_file_df.groupby('page')
|
784 |
|
785 |
# Create a list to hold the JSON data
|
786 |
json_data = []
|
|
|
788 |
for n, pdf_image_path in enumerate(image_paths):
|
789 |
reported_page_number = int(n + 1)
|
790 |
|
791 |
+
if reported_page_number in review_file_df["page"].values:
|
792 |
|
793 |
# Convert each relevant group to a list of box dictionaries
|
794 |
selected_csv_pages = grouped_csv_pages.get_group(reported_page_number)
|
tools/file_redaction.py
CHANGED
@@ -288,7 +288,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
288 |
|
289 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
290 |
|
291 |
-
pymupdf_doc,all_decision_process_table,log_files_output_paths,new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
292 |
prepared_pdf_image_paths,
|
293 |
language,
|
294 |
chosen_redact_entities,
|
@@ -314,6 +314,9 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
314 |
custom_recogniser_word_list,
|
315 |
redact_whole_page_list)
|
316 |
|
|
|
|
|
|
|
317 |
# Save Textract request metadata (if exists)
|
318 |
if new_request_metadata:
|
319 |
print("Request metadata:", new_request_metadata)
|
@@ -396,10 +399,11 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
396 |
json.dump(annotations_all_pages, f)
|
397 |
log_files_output_paths.append(out_annotation_file_path)
|
398 |
|
399 |
-
|
400 |
|
401 |
# Convert json to csv and also save this
|
402 |
#print("annotations_all_pages:", annotations_all_pages)
|
|
|
403 |
|
404 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
405 |
|
@@ -975,11 +979,11 @@ def redact_image_pdf(file_path:str,
|
|
975 |
if analysis_type == textract_option:
|
976 |
|
977 |
json_file_path = output_folder + file_name + "_textract.json"
|
978 |
-
|
979 |
|
980 |
if not os.path.exists(json_file_path):
|
981 |
print("No existing Textract results file found.")
|
982 |
-
|
983 |
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
984 |
#log_files_output_paths.append(json_file_path)
|
985 |
#request_metadata = request_metadata + "\n" + new_request_metadata
|
@@ -988,8 +992,12 @@ def redact_image_pdf(file_path:str,
|
|
988 |
# Open the file and load the JSON data
|
989 |
no_textract_file = False
|
990 |
print("Found existing Textract json results file.")
|
|
|
|
|
|
|
|
|
991 |
with open(json_file_path, 'r') as json_file:
|
992 |
-
|
993 |
|
994 |
###
|
995 |
|
@@ -1046,32 +1054,46 @@ def redact_image_pdf(file_path:str,
|
|
1046 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
1047 |
pdf_page_as_bytes = image_buffer.getvalue()
|
1048 |
|
1049 |
-
if not
|
1050 |
-
|
1051 |
-
|
1052 |
-
|
|
|
|
|
1053 |
|
1054 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1055 |
|
1056 |
else:
|
1057 |
# Check if the current reported_page_number exists in the loaded JSON
|
1058 |
-
page_exists = any(page['page_no'] == reported_page_number for page in
|
1059 |
|
1060 |
if not page_exists: # If the page does not exist, analyze again
|
1061 |
print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
|
1062 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1063 |
|
1064 |
# Check if "pages" key exists, if not, initialize it as an empty list
|
1065 |
-
if "pages" not in
|
1066 |
-
|
1067 |
|
1068 |
# Append the new page data
|
1069 |
-
|
1070 |
|
1071 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
1072 |
else:
|
1073 |
# If the page exists, retrieve the data
|
1074 |
-
text_blocks = next(page['data'] for page in
|
1075 |
|
1076 |
|
1077 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
@@ -1214,7 +1236,10 @@ def redact_image_pdf(file_path:str,
|
|
1214 |
if analysis_type == textract_option:
|
1215 |
# Write the updated existing textract data back to the JSON file
|
1216 |
with open(json_file_path, 'w') as json_file:
|
1217 |
-
json.dump(
|
|
|
|
|
|
|
1218 |
|
1219 |
current_loop_page += 1
|
1220 |
|
@@ -1245,7 +1270,10 @@ def redact_image_pdf(file_path:str,
|
|
1245 |
if analysis_type == textract_option:
|
1246 |
# Write the updated existing textract data back to the JSON file
|
1247 |
with open(json_file_path, 'w') as json_file:
|
1248 |
-
json.dump(
|
|
|
|
|
|
|
1249 |
|
1250 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1251 |
|
@@ -1253,7 +1281,9 @@ def redact_image_pdf(file_path:str,
|
|
1253 |
# Write the updated existing textract data back to the JSON file
|
1254 |
|
1255 |
with open(json_file_path, 'w') as json_file:
|
1256 |
-
json.dump(
|
|
|
|
|
1257 |
|
1258 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1259 |
|
@@ -1495,7 +1525,7 @@ def create_text_redaction_process_results(analyser_results, analysed_bounding_bo
|
|
1495 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1496 |
|
1497 |
# Convert the new columns to integers (if needed)
|
1498 |
-
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float)
|
1499 |
|
1500 |
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
1501 |
analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
|
|
|
288 |
|
289 |
print("Redacting file " + file_path_without_ext + " as an image-based file")
|
290 |
|
291 |
+
pymupdf_doc, all_decision_process_table, log_files_output_paths, new_request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number = redact_image_pdf(file_path,
|
292 |
prepared_pdf_image_paths,
|
293 |
language,
|
294 |
chosen_redact_entities,
|
|
|
314 |
custom_recogniser_word_list,
|
315 |
redact_whole_page_list)
|
316 |
|
317 |
+
|
318 |
+
print("log_files_output_paths at end of image redact function:", log_files_output_paths)
|
319 |
+
|
320 |
# Save Textract request metadata (if exists)
|
321 |
if new_request_metadata:
|
322 |
print("Request metadata:", new_request_metadata)
|
|
|
399 |
json.dump(annotations_all_pages, f)
|
400 |
log_files_output_paths.append(out_annotation_file_path)
|
401 |
|
402 |
+
print("Saving annotations to CSV")
|
403 |
|
404 |
# Convert json to csv and also save this
|
405 |
#print("annotations_all_pages:", annotations_all_pages)
|
406 |
+
#print("all_decision_process_table:", all_decision_process_table)
|
407 |
|
408 |
review_df = convert_review_json_to_pandas_df(annotations_all_pages, all_decision_process_table)
|
409 |
|
|
|
979 |
if analysis_type == textract_option:
|
980 |
|
981 |
json_file_path = output_folder + file_name + "_textract.json"
|
982 |
+
|
983 |
|
984 |
if not os.path.exists(json_file_path):
|
985 |
print("No existing Textract results file found.")
|
986 |
+
textract_data = {}
|
987 |
#text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
988 |
#log_files_output_paths.append(json_file_path)
|
989 |
#request_metadata = request_metadata + "\n" + new_request_metadata
|
|
|
992 |
# Open the file and load the JSON data
|
993 |
no_textract_file = False
|
994 |
print("Found existing Textract json results file.")
|
995 |
+
|
996 |
+
if json_file_path not in log_files_output_paths:
|
997 |
+
log_files_output_paths.append(json_file_path)
|
998 |
+
|
999 |
with open(json_file_path, 'r') as json_file:
|
1000 |
+
textract_data = json.load(json_file)
|
1001 |
|
1002 |
###
|
1003 |
|
|
|
1054 |
image.save(image_buffer, format='PNG') # Save as PNG, or adjust format if needed
|
1055 |
pdf_page_as_bytes = image_buffer.getvalue()
|
1056 |
|
1057 |
+
if not textract_data:
|
1058 |
+
try:
|
1059 |
+
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1060 |
+
|
1061 |
+
if json_file_path not in log_files_output_paths:
|
1062 |
+
log_files_output_paths.append(json_file_path)
|
1063 |
|
1064 |
+
textract_data = {"pages":[text_blocks]}
|
1065 |
+
except Exception as e:
|
1066 |
+
print("Textract extraction for page", reported_page_number, "failed due to:", e)
|
1067 |
+
textract_data = {"pages":[]}
|
1068 |
+
new_request_metadata = "Failed Textract API call"
|
1069 |
+
|
1070 |
+
request_metadata = request_metadata + "\n" + new_request_metadata
|
1071 |
|
1072 |
else:
|
1073 |
# Check if the current reported_page_number exists in the loaded JSON
|
1074 |
+
page_exists = any(page['page_no'] == reported_page_number for page in textract_data.get("pages", []))
|
1075 |
|
1076 |
if not page_exists: # If the page does not exist, analyze again
|
1077 |
print(f"Page number {reported_page_number} not found in existing Textract data. Analysing.")
|
1078 |
+
|
1079 |
+
try:
|
1080 |
+
text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1081 |
+
except Exception as e:
|
1082 |
+
print("Textract extraction for page", reported_page_number, "failed due to:", e)
|
1083 |
+
text_bocks = []
|
1084 |
+
new_request_metadata = "Failed Textract API call"
|
1085 |
|
1086 |
# Check if "pages" key exists, if not, initialize it as an empty list
|
1087 |
+
if "pages" not in textract_data:
|
1088 |
+
textract_data["pages"] = []
|
1089 |
|
1090 |
# Append the new page data
|
1091 |
+
textract_data["pages"].append(text_blocks)
|
1092 |
|
1093 |
request_metadata = request_metadata + "\n" + new_request_metadata
|
1094 |
else:
|
1095 |
# If the page exists, retrieve the data
|
1096 |
+
text_blocks = next(page['data'] for page in textract_data["pages"] if page['page_no'] == reported_page_number)
|
1097 |
|
1098 |
|
1099 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
|
|
1236 |
if analysis_type == textract_option:
|
1237 |
# Write the updated existing textract data back to the JSON file
|
1238 |
with open(json_file_path, 'w') as json_file:
|
1239 |
+
json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1240 |
+
|
1241 |
+
if json_file_path not in log_files_output_paths:
|
1242 |
+
log_files_output_paths.append(json_file_path)
|
1243 |
|
1244 |
current_loop_page += 1
|
1245 |
|
|
|
1270 |
if analysis_type == textract_option:
|
1271 |
# Write the updated existing textract data back to the JSON file
|
1272 |
with open(json_file_path, 'w') as json_file:
|
1273 |
+
json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1274 |
+
|
1275 |
+
if json_file_path not in log_files_output_paths:
|
1276 |
+
log_files_output_paths.append(json_file_path)
|
1277 |
|
1278 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1279 |
|
|
|
1281 |
# Write the updated existing textract data back to the JSON file
|
1282 |
|
1283 |
with open(json_file_path, 'w') as json_file:
|
1284 |
+
json.dump(textract_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1285 |
+
if json_file_path not in log_files_output_paths:
|
1286 |
+
log_files_output_paths.append(json_file_path)
|
1287 |
|
1288 |
return pymupdf_doc, all_decision_process_table, log_files_output_paths, request_metadata, annotations_all_pages, current_loop_page, page_break_return, all_line_level_ocr_results_df, comprehend_query_number
|
1289 |
|
|
|
1525 |
analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']] = analysed_bounding_boxes_df_new['boundingBox'].apply(pd.Series)
|
1526 |
|
1527 |
# Convert the new columns to integers (if needed)
|
1528 |
+
analysed_bounding_boxes_df_new.loc[:, ['xmin', 'ymin', 'xmax', 'ymax']] = (analysed_bounding_boxes_df_new[['xmin', 'ymin', 'xmax', 'ymax']].astype(float) / 5).round() * 5
|
1529 |
|
1530 |
analysed_bounding_boxes_df_text = analysed_bounding_boxes_df_new['result'].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
|
1531 |
analysed_bounding_boxes_df_text.columns = ["label", "start", "end", "score"]
|
tools/helper_functions.py
CHANGED
@@ -17,7 +17,7 @@ def reset_state_vars():
|
|
17 |
show_share_button=False,
|
18 |
show_remove_button=False,
|
19 |
interactive=False
|
20 |
-
)
|
21 |
|
22 |
def get_or_create_env_var(var_name, default_value):
|
23 |
# Get the environment variable if it exists
|
|
|
17 |
show_share_button=False,
|
18 |
show_remove_button=False,
|
19 |
interactive=False
|
20 |
+
), [], []
|
21 |
|
22 |
def get_or_create_env_var(var_name, default_value):
|
23 |
# Get the environment variable if it exists
|
tools/redaction_review.py
CHANGED
@@ -56,7 +56,7 @@ def update_annotator(image_annotator_object:AnnotatedImageData, page_num:int, re
|
|
56 |
recogniser_entities = []
|
57 |
recogniser_dataframe = pd.DataFrame()
|
58 |
|
59 |
-
if recogniser_dataframe_gr.
|
60 |
try:
|
61 |
review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
|
62 |
#print("review_dataframe['label']", review_dataframe["label"])
|
|
|
56 |
recogniser_entities = []
|
57 |
recogniser_dataframe = pd.DataFrame()
|
58 |
|
59 |
+
if recogniser_dataframe_gr.empty:
|
60 |
try:
|
61 |
review_dataframe = convert_review_json_to_pandas_df(image_annotator_object)[["page", "label"]]
|
62 |
#print("review_dataframe['label']", review_dataframe["label"])
|