Spaces:
Sleeping
Sleeping
Commit
·
11770c9
1
Parent(s):
eafaaed
Fixed redaction of image files
Browse files- tools/file_conversion.py +9 -28
- tools/file_redaction.py +3 -41
tools/file_conversion.py
CHANGED
@@ -478,11 +478,12 @@ def prepare_image_or_pdf(
|
|
478 |
annotation["image"] = image_path
|
479 |
|
480 |
all_annotations_object.append(annotation)
|
481 |
-
|
482 |
-
#print("all_annotations_object:", all_annotations_object)
|
483 |
-
|
484 |
|
485 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
|
|
|
|
|
|
|
|
486 |
# Convert image to a pymupdf document
|
487 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
488 |
|
@@ -491,10 +492,12 @@ def prepare_image_or_pdf(
|
|
491 |
page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
|
492 |
page.insert_image(rect, filename=file_path) # Insert the image into the page
|
493 |
|
|
|
|
|
|
|
|
|
|
|
494 |
|
495 |
-
# Check if the file is an image type and the user selected text ocr option
|
496 |
-
elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
|
497 |
-
in_redact_method = tesseract_ocr_option
|
498 |
|
499 |
elif file_extension in ['.csv']:
|
500 |
review_file_csv = read_file(file)
|
@@ -618,12 +621,7 @@ def prepare_image_or_pdf(
|
|
618 |
out_message.append(out_time)
|
619 |
out_message_out = '\n'.join(out_message)
|
620 |
|
621 |
-
#if prepare_for_review == False:
|
622 |
number_of_pages = len(image_file_paths)
|
623 |
-
#else:
|
624 |
-
# number_of_pages = len(all_annotations_object)
|
625 |
-
|
626 |
-
#print("all_annotations_object at end:", all_annotations_object)
|
627 |
|
628 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
629 |
|
@@ -650,23 +648,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
|
|
650 |
|
651 |
return out_message, out_file_paths
|
652 |
|
653 |
-
# Example DataFrames
|
654 |
-
# df1 = pd.DataFrame({
|
655 |
-
# 'xmin': [10, 20, 30],
|
656 |
-
# 'xmax': [15, 25, 35],
|
657 |
-
# 'ymin': [40, 50, 60],
|
658 |
-
# 'ymax': [45, 55, 65],
|
659 |
-
# 'info1': ['A', 'B', 'C']
|
660 |
-
# })
|
661 |
-
|
662 |
-
# df2 = pd.DataFrame({
|
663 |
-
# 'xmin': [12, 18, 32],
|
664 |
-
# 'xmax': [14, 24, 34],
|
665 |
-
# 'ymin': [42, 48, 62],
|
666 |
-
# 'ymax': [44, 54, 66],
|
667 |
-
# 'info2': ['X', 'Y', 'Z']
|
668 |
-
# })
|
669 |
-
|
670 |
def join_values_within_threshold(df1, df2):
|
671 |
# Threshold for matching
|
672 |
threshold = 5
|
|
|
478 |
annotation["image"] = image_path
|
479 |
|
480 |
all_annotations_object.append(annotation)
|
|
|
|
|
|
|
481 |
|
482 |
elif is_pdf_or_image(file_path): # Alternatively, if it's an image
|
483 |
+
# Check if the file is an image type and the user selected text ocr option
|
484 |
+
if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
|
485 |
+
in_redact_method = tesseract_ocr_option
|
486 |
+
|
487 |
# Convert image to a pymupdf document
|
488 |
pymupdf_doc = pymupdf.open() # Create a new empty document
|
489 |
|
|
|
492 |
page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
|
493 |
page.insert_image(rect, filename=file_path) # Insert the image into the page
|
494 |
|
495 |
+
file_path_str = str(file_path)
|
496 |
+
|
497 |
+
image_file_paths = process_file(file_path_str, prepare_for_review)
|
498 |
+
|
499 |
+
print("Inserted image into PDF file")
|
500 |
|
|
|
|
|
|
|
501 |
|
502 |
elif file_extension in ['.csv']:
|
503 |
review_file_csv = read_file(file)
|
|
|
621 |
out_message.append(out_time)
|
622 |
out_message_out = '\n'.join(out_message)
|
623 |
|
|
|
624 |
number_of_pages = len(image_file_paths)
|
|
|
|
|
|
|
|
|
625 |
|
626 |
return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
|
627 |
|
|
|
648 |
|
649 |
return out_message, out_file_paths
|
650 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
651 |
def join_values_within_threshold(df1, df2):
|
652 |
# Threshold for matching
|
653 |
threshold = 5
|
tools/file_redaction.py
CHANGED
@@ -269,7 +269,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
269 |
print("Redacting file:", file_path_without_ext)
|
270 |
|
271 |
is_a_pdf = is_pdf(file_path) == True
|
272 |
-
if is_a_pdf == False:
|
273 |
# If user has not submitted a pdf, assume it's an image
|
274 |
print("File is not a pdf, assuming that image analysis needs to be used.")
|
275 |
in_redact_method = tesseract_ocr_option
|
@@ -753,8 +753,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
|
|
753 |
|
754 |
return page, out_annotation_boxes
|
755 |
|
756 |
-
|
757 |
-
|
758 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
759 |
|
760 |
all_bboxes = []
|
@@ -767,6 +765,8 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
|
|
767 |
# Process signature and handwriting results
|
768 |
if signature_recogniser_results or handwriting_recogniser_results:
|
769 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
|
|
|
|
770 |
merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
|
771 |
|
772 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
@@ -1085,44 +1085,6 @@ def redact_image_pdf(file_path:str,
|
|
1085 |
text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
|
1086 |
|
1087 |
|
1088 |
-
# if not os.path.exists(json_file_path):
|
1089 |
-
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1090 |
-
# log_files_output_paths.append(json_file_path)
|
1091 |
-
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1092 |
-
|
1093 |
-
# existing_data = {"pages":[text_blocks]}
|
1094 |
-
|
1095 |
-
|
1096 |
-
# else:
|
1097 |
-
# # Open the file and load the JSON data
|
1098 |
-
# print("Found existing Textract json results file.")
|
1099 |
-
# with open(json_file_path, 'r') as json_file:
|
1100 |
-
# existing_data = json.load(json_file)
|
1101 |
-
|
1102 |
-
# # Check if the current reported_page_number exists in the loaded JSON
|
1103 |
-
# page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
|
1104 |
-
|
1105 |
-
# if not page_exists: # If the page does not exist, analyze again
|
1106 |
-
# print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
|
1107 |
-
# text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
|
1108 |
-
|
1109 |
-
# # Check if "pages" key exists, if not, initialize it as an empty list
|
1110 |
-
# if "pages" not in existing_data:
|
1111 |
-
# existing_data["pages"] = []
|
1112 |
-
|
1113 |
-
# # Append the new page data
|
1114 |
-
# existing_data["pages"].append(text_blocks)
|
1115 |
-
|
1116 |
-
# # Write the updated existing_data back to the JSON file
|
1117 |
-
# with open(json_file_path, 'w') as json_file:
|
1118 |
-
# json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
|
1119 |
-
|
1120 |
-
# log_files_output_paths.append(json_file_path)
|
1121 |
-
# request_metadata = request_metadata + "\n" + new_request_metadata
|
1122 |
-
# else:
|
1123 |
-
# # If the page exists, retrieve the data
|
1124 |
-
# text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
|
1125 |
-
|
1126 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1127 |
|
1128 |
# Step 2: Analyze text and identify PII
|
|
|
269 |
print("Redacting file:", file_path_without_ext)
|
270 |
|
271 |
is_a_pdf = is_pdf(file_path) == True
|
272 |
+
if is_a_pdf == False and in_redact_method == text_ocr_option:
|
273 |
# If user has not submitted a pdf, assume it's an image
|
274 |
print("File is not a pdf, assuming that image analysis needs to be used.")
|
275 |
in_redact_method = tesseract_ocr_option
|
|
|
753 |
|
754 |
return page, out_annotation_boxes
|
755 |
|
|
|
|
|
756 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
757 |
|
758 |
all_bboxes = []
|
|
|
765 |
# Process signature and handwriting results
|
766 |
if signature_recogniser_results or handwriting_recogniser_results:
|
767 |
if "Redact all identified handwriting" in handwrite_signature_checkbox:
|
768 |
+
print("handwriting_recogniser_results:", handwriting_recogniser_results)
|
769 |
+
|
770 |
merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
|
771 |
|
772 |
if "Redact all identified signatures" in handwrite_signature_checkbox:
|
|
|
1085 |
text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
|
1086 |
|
1087 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1088 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1089 |
|
1090 |
# Step 2: Analyze text and identify PII
|