seanpedrickcase committed on
Commit
23f8ca3
·
1 Parent(s): f0c28d7

Fixed an issue where redactions sometimes did not remove the text underneath the boxes. You can now redact in different colours from the review page.

Browse files
tools/aws_functions.py CHANGED
@@ -36,7 +36,7 @@ if RUN_AWS_FUNCTIONS == "1":
36
  bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
37
  session = boto3.Session()
38
 
39
- print("session:", session)
40
 
41
  except Exception as e:
42
  print("Could not start boto3 session:", e)
 
36
  bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
37
  session = boto3.Session()
38
 
39
+ #print("session:", session)
40
 
41
  except Exception as e:
42
  print("Could not start boto3 session:", e)
tools/file_redaction.py CHANGED
@@ -585,6 +585,9 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
585
  rect_height = page.rect.height
586
  rect_width = page.rect.width
587
 
 
 
 
588
  out_annotation_boxes = {}
589
  all_image_annotation_boxes = []
590
  image_path = ""
@@ -610,11 +613,17 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
610
  if isinstance(annot, dict):
611
  img_annotation_box = annot
612
 
613
- x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
 
 
 
614
 
615
  # Else should be CustomImageRecognizerResult
616
  else:
617
- x1, pymupdf_y1, x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
 
 
 
618
 
619
  img_annotation_box["xmin"] = annot.left
620
  img_annotation_box["ymin"] = annot.top
@@ -630,7 +639,10 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
630
 
631
  # Else it should be a pikepdf annotation object
632
  else:
633
- x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
 
 
 
634
 
635
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
636
 
@@ -657,17 +669,29 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
657
 
658
  all_image_annotation_boxes.append(img_annotation_box)
659
 
660
- # Calculate the middle y value and set height to 1 pixel
 
661
  middle_y = (pymupdf_y1 + pymupdf_y2) / 2
662
- rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2) # Small height in middle of word to remove text
663
 
664
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
665
- page.add_redact_annot(rect_single_pixel_height)
 
666
 
667
  # Set up drawing a black box over the whole rect
668
  shape = page.new_shape()
669
  shape.draw_rect(rect)
670
- shape.finish(color=(0, 0, 0), fill=(0, 0, 0)) # Black fill for the rectangle
 
 
 
 
 
 
 
 
 
 
671
  shape.commit()
672
 
673
  out_annotation_boxes = {
 
585
  rect_height = page.rect.height
586
  rect_width = page.rect.width
587
 
588
+ pymupdf_x1 = None
589
+ pymupdf_x2 = None
590
+
591
  out_annotation_boxes = {}
592
  all_image_annotation_boxes = []
593
  image_path = ""
 
613
  if isinstance(annot, dict):
614
  img_annotation_box = annot
615
 
616
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
617
+
618
+ x1 = pymupdf_x1
619
+ x2 = pymupdf_x2
620
 
621
  # Else should be CustomImageRecognizerResult
622
  else:
623
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
624
+
625
+ x1 = pymupdf_x1
626
+ x2 = pymupdf_x2
627
 
628
  img_annotation_box["xmin"] = annot.left
629
  img_annotation_box["ymin"] = annot.top
 
639
 
640
  # Else it should be a pikepdf annotation object
641
  else:
642
+ pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
643
+
644
+ x1 = pymupdf_x1
645
+ x2 = pymupdf_x2
646
 
647
  rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
648
 
 
669
 
670
  all_image_annotation_boxes.append(img_annotation_box)
671
 
672
+ # Calculate the middle y value and set a small height (not used)
673
+ #print("Rect:", rect)
674
  middle_y = (pymupdf_y1 + pymupdf_y2) / 2
675
+ rect_small_pixel_height = Rect(pymupdf_x1, middle_y - 2, pymupdf_x2, middle_y + 2) # Small height in middle of line
676
 
677
  # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
678
+ #page.add_redact_annot(rect)#rect_small_pixel_height)
679
+ page.add_redact_annot(rect_small_pixel_height)
680
 
681
  # Set up drawing a black box over the whole rect
682
  shape = page.new_shape()
683
  shape.draw_rect(rect)
684
+
685
+ def convert_color_to_range_0_1(color):
686
+ return tuple(component / 255 for component in color)
687
+
688
+ if img_annotation_box["color"][0] > 1:
689
+ out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
690
+ else:
691
+ out_colour = img_annotation_box["color"]
692
+
693
+ shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
694
+ #shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
695
  shape.commit()
696
 
697
  out_annotation_boxes = {
tools/redaction_review.py CHANGED
@@ -164,16 +164,16 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
164
 
165
  # If working with image docs
166
  if is_pdf(file_path) == False:
167
- unredacted_doc = Image.open(file_paths[-1])
168
 
169
- image = unredacted_doc
170
 
171
  # try:
172
  # image = Image.open(image_annotated['image'])
173
  # except:
174
  # image = Image.fromarray(image_annotated['image'].astype('uint8'))
175
 
176
- draw = ImageDraw.Draw(unredacted_doc)
177
 
178
  for img_annotation_box in image_annotated['boxes']:
179
  coords = [img_annotation_box["xmin"],
@@ -191,9 +191,9 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
191
 
192
  # If working with pdfs
193
  else:
194
- unredacted_doc = pymupdf.open(file_path)
195
 
196
- number_of_pages = unredacted_doc.page_count
197
 
198
  print("Saving pages to file.")
199
 
@@ -216,12 +216,12 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
216
  elif isinstance(image_loc, str):
217
  image = Image.open(image_loc)
218
 
219
- pymupdf_page = unredacted_doc.load_page(i) #doc.load_page(current_page -1)
220
  pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
221
 
222
  #try:
223
  out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
224
- unredacted_doc.save(out_pdf_file_path)
225
  output_files.append(out_pdf_file_path)
226
 
227
  # Save the gradio_annotation_boxes to a JSON file
 
164
 
165
  # If working with image docs
166
  if is_pdf(file_path) == False:
167
+ pdf_doc = Image.open(file_paths[-1])
168
 
169
+ image = pdf_doc
170
 
171
  # try:
172
  # image = Image.open(image_annotated['image'])
173
  # except:
174
  # image = Image.fromarray(image_annotated['image'].astype('uint8'))
175
 
176
+ draw = ImageDraw.Draw(pdf_doc)
177
 
178
  for img_annotation_box in image_annotated['boxes']:
179
  coords = [img_annotation_box["xmin"],
 
191
 
192
  # If working with pdfs
193
  else:
194
+ pdf_doc = pymupdf.open(file_path)
195
 
196
+ number_of_pages = pdf_doc.page_count
197
 
198
  print("Saving pages to file.")
199
 
 
216
  elif isinstance(image_loc, str):
217
  image = Image.open(image_loc)
218
 
219
+ pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
220
  pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
221
 
222
  #try:
223
  out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
224
+ pdf_doc.save(out_pdf_file_path)
225
  output_files.append(out_pdf_file_path)
226
 
227
  # Save the gradio_annotation_boxes to a JSON file