seanpedrickcase committed on
Commit
11770c9
·
1 Parent(s): eafaaed

Fixed redaction of image files

Browse files
Files changed (2) hide show
  1. tools/file_conversion.py +9 -28
  2. tools/file_redaction.py +3 -41
tools/file_conversion.py CHANGED
@@ -478,11 +478,12 @@ def prepare_image_or_pdf(
478
  annotation["image"] = image_path
479
 
480
  all_annotations_object.append(annotation)
481
-
482
- #print("all_annotations_object:", all_annotations_object)
483
-
484
 
485
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
 
 
 
 
486
  # Convert image to a pymupdf document
487
  pymupdf_doc = pymupdf.open() # Create a new empty document
488
 
@@ -491,10 +492,12 @@ def prepare_image_or_pdf(
491
  page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
492
  page.insert_image(rect, filename=file_path) # Insert the image into the page
493
 
 
 
 
 
 
494
 
495
- # Check if the file is an image type and the user selected text ocr option
496
- elif file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
497
- in_redact_method = tesseract_ocr_option
498
 
499
  elif file_extension in ['.csv']:
500
  review_file_csv = read_file(file)
@@ -618,12 +621,7 @@ def prepare_image_or_pdf(
618
  out_message.append(out_time)
619
  out_message_out = '\n'.join(out_message)
620
 
621
- #if prepare_for_review == False:
622
  number_of_pages = len(image_file_paths)
623
- #else:
624
- # number_of_pages = len(all_annotations_object)
625
-
626
- #print("all_annotations_object at end:", all_annotations_object)
627
 
628
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
629
 
@@ -650,23 +648,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str],
650
 
651
  return out_message, out_file_paths
652
 
653
- # Example DataFrames
654
- # df1 = pd.DataFrame({
655
- # 'xmin': [10, 20, 30],
656
- # 'xmax': [15, 25, 35],
657
- # 'ymin': [40, 50, 60],
658
- # 'ymax': [45, 55, 65],
659
- # 'info1': ['A', 'B', 'C']
660
- # })
661
-
662
- # df2 = pd.DataFrame({
663
- # 'xmin': [12, 18, 32],
664
- # 'xmax': [14, 24, 34],
665
- # 'ymin': [42, 48, 62],
666
- # 'ymax': [44, 54, 66],
667
- # 'info2': ['X', 'Y', 'Z']
668
- # })
669
-
670
  def join_values_within_threshold(df1, df2):
671
  # Threshold for matching
672
  threshold = 5
 
478
  annotation["image"] = image_path
479
 
480
  all_annotations_object.append(annotation)
 
 
 
481
 
482
  elif is_pdf_or_image(file_path): # Alternatively, if it's an image
483
+ # Check if the file is an image type and the user selected text ocr option
484
+ if file_extension in ['.jpg', '.jpeg', '.png'] and in_redact_method == text_ocr_option:
485
+ in_redact_method = tesseract_ocr_option
486
+
487
  # Convert image to a pymupdf document
488
  pymupdf_doc = pymupdf.open() # Create a new empty document
489
 
 
492
  page = pymupdf_doc.new_page(width=img.width, height=img.height) # Add a new page
493
  page.insert_image(rect, filename=file_path) # Insert the image into the page
494
 
495
+ file_path_str = str(file_path)
496
+
497
+ image_file_paths = process_file(file_path_str, prepare_for_review)
498
+
499
+ print("Inserted image into PDF file")
500
 
 
 
 
501
 
502
  elif file_extension in ['.csv']:
503
  review_file_csv = read_file(file)
 
621
  out_message.append(out_time)
622
  out_message_out = '\n'.join(out_message)
623
 
 
624
  number_of_pages = len(image_file_paths)
 
 
 
 
625
 
626
  return out_message_out, converted_file_paths, image_file_paths, number_of_pages, number_of_pages, pymupdf_doc, all_annotations_object, review_file_csv
627
 
 
648
 
649
  return out_message, out_file_paths
650
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
651
  def join_values_within_threshold(df1, df2):
652
  # Threshold for matching
653
  threshold = 5
tools/file_redaction.py CHANGED
@@ -269,7 +269,7 @@ def choose_and_run_redactor(file_paths:List[str],
269
  print("Redacting file:", file_path_without_ext)
270
 
271
  is_a_pdf = is_pdf(file_path) == True
272
- if is_a_pdf == False:
273
  # If user has not submitted a pdf, assume it's an image
274
  print("File is not a pdf, assuming that image analysis needs to be used.")
275
  in_redact_method = tesseract_ocr_option
@@ -753,8 +753,6 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
753
 
754
  return page, out_annotation_boxes
755
 
756
-
757
-
758
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
759
 
760
  all_bboxes = []
@@ -767,6 +765,8 @@ def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_result
767
  # Process signature and handwriting results
768
  if signature_recogniser_results or handwriting_recogniser_results:
769
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
 
 
770
  merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
771
 
772
  if "Redact all identified signatures" in handwrite_signature_checkbox:
@@ -1085,44 +1085,6 @@ def redact_image_pdf(file_path:str,
1085
  text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
1086
 
1087
 
1088
- # if not os.path.exists(json_file_path):
1089
- # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1090
- # log_files_output_paths.append(json_file_path)
1091
- # request_metadata = request_metadata + "\n" + new_request_metadata
1092
-
1093
- # existing_data = {"pages":[text_blocks]}
1094
-
1095
-
1096
- # else:
1097
- # # Open the file and load the JSON data
1098
- # print("Found existing Textract json results file.")
1099
- # with open(json_file_path, 'r') as json_file:
1100
- # existing_data = json.load(json_file)
1101
-
1102
- # # Check if the current reported_page_number exists in the loaded JSON
1103
- # page_exists = any(page['page_no'] == reported_page_number for page in existing_data.get("pages", []))
1104
-
1105
- # if not page_exists: # If the page does not exist, analyze again
1106
- # print(f"Page number {reported_page_number} not found in existing data. Analyzing again.")
1107
- # text_blocks, new_request_metadata = analyse_page_with_textract(pdf_page_as_bytes, reported_page_number, textract_client, handwrite_signature_checkbox) # Analyse page with Textract
1108
-
1109
- # # Check if "pages" key exists, if not, initialize it as an empty list
1110
- # if "pages" not in existing_data:
1111
- # existing_data["pages"] = []
1112
-
1113
- # # Append the new page data
1114
- # existing_data["pages"].append(text_blocks)
1115
-
1116
- # # Write the updated existing_data back to the JSON file
1117
- # with open(json_file_path, 'w') as json_file:
1118
- # json.dump(existing_data, json_file, indent=4) # indent=4 makes the JSON file pretty-printed
1119
-
1120
- # log_files_output_paths.append(json_file_path)
1121
- # request_metadata = request_metadata + "\n" + new_request_metadata
1122
- # else:
1123
- # # If the page exists, retrieve the data
1124
- # text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
1125
-
1126
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1127
 
1128
  # Step 2: Analyze text and identify PII
 
269
  print("Redacting file:", file_path_without_ext)
270
 
271
  is_a_pdf = is_pdf(file_path) == True
272
+ if is_a_pdf == False and in_redact_method == text_ocr_option:
273
  # If user has not submitted a pdf, assume it's an image
274
  print("File is not a pdf, assuming that image analysis needs to be used.")
275
  in_redact_method = tesseract_ocr_option
 
753
 
754
  return page, out_annotation_boxes
755
 
 
 
756
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
757
 
758
  all_bboxes = []
 
765
  # Process signature and handwriting results
766
  if signature_recogniser_results or handwriting_recogniser_results:
767
  if "Redact all identified handwriting" in handwrite_signature_checkbox:
768
+ print("handwriting_recogniser_results:", handwriting_recogniser_results)
769
+
770
  merged_bboxes.extend(copy.deepcopy(handwriting_recogniser_results))
771
 
772
  if "Redact all identified signatures" in handwrite_signature_checkbox:
 
1085
  text_blocks = next(page['data'] for page in existing_data["pages"] if page['page_no'] == reported_page_number)
1086
 
1087
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1088
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1089
 
1090
  # Step 2: Analyze text and identify PII