Commit
·
23f8ca3
1
Parent(s):
f0c28d7
Fixed an issue where redactions sometimes did not remove the text underneath the boxes. You can now redact in different colours from the review page.
Browse files
- tools/aws_functions.py +1 -1
- tools/file_redaction.py +31 -7
- tools/redaction_review.py +7 -7
tools/aws_functions.py
CHANGED
@@ -36,7 +36,7 @@ if RUN_AWS_FUNCTIONS == "1":
|
|
36 |
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
37 |
session = boto3.Session()
|
38 |
|
39 |
-
print("session:", session)
|
40 |
|
41 |
except Exception as e:
|
42 |
print("Could not start boto3 session:", e)
|
|
|
36 |
bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
|
37 |
session = boto3.Session()
|
38 |
|
39 |
+
#print("session:", session)
|
40 |
|
41 |
except Exception as e:
|
42 |
print("Could not start boto3 session:", e)
|
tools/file_redaction.py
CHANGED
@@ -585,6 +585,9 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
|
|
585 |
rect_height = page.rect.height
|
586 |
rect_width = page.rect.width
|
587 |
|
|
|
|
|
|
|
588 |
out_annotation_boxes = {}
|
589 |
all_image_annotation_boxes = []
|
590 |
image_path = ""
|
@@ -610,11 +613,17 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
|
|
610 |
if isinstance(annot, dict):
|
611 |
img_annotation_box = annot
|
612 |
|
613 |
-
|
|
|
|
|
|
|
614 |
|
615 |
# Else should be CustomImageRecognizerResult
|
616 |
else:
|
617 |
-
|
|
|
|
|
|
|
618 |
|
619 |
img_annotation_box["xmin"] = annot.left
|
620 |
img_annotation_box["ymin"] = annot.top
|
@@ -630,7 +639,10 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
|
|
630 |
|
631 |
# Else it should be a pikepdf annotation object
|
632 |
else:
|
633 |
-
|
|
|
|
|
|
|
634 |
|
635 |
rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
|
636 |
|
@@ -657,17 +669,29 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
|
|
657 |
|
658 |
all_image_annotation_boxes.append(img_annotation_box)
|
659 |
|
660 |
-
# Calculate the middle y value and set height
|
|
|
661 |
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
662 |
-
|
663 |
|
664 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
665 |
-
page.add_redact_annot(
|
|
|
666 |
|
667 |
# Set up drawing a black box over the whole rect
|
668 |
shape = page.new_shape()
|
669 |
shape.draw_rect(rect)
|
670 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
671 |
shape.commit()
|
672 |
|
673 |
out_annotation_boxes = {
|
|
|
585 |
rect_height = page.rect.height
|
586 |
rect_width = page.rect.width
|
587 |
|
588 |
+
pymupdf_x1 = None
|
589 |
+
pymupdf_x2 = None
|
590 |
+
|
591 |
out_annotation_boxes = {}
|
592 |
all_image_annotation_boxes = []
|
593 |
image_path = ""
|
|
|
613 |
if isinstance(annot, dict):
|
614 |
img_annotation_box = annot
|
615 |
|
616 |
+
pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
|
617 |
+
|
618 |
+
x1 = pymupdf_x1
|
619 |
+
x2 = pymupdf_x2
|
620 |
|
621 |
# Else should be CustomImageRecognizerResult
|
622 |
else:
|
623 |
+
pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
|
624 |
+
|
625 |
+
x1 = pymupdf_x1
|
626 |
+
x2 = pymupdf_x2
|
627 |
|
628 |
img_annotation_box["xmin"] = annot.left
|
629 |
img_annotation_box["ymin"] = annot.top
|
|
|
639 |
|
640 |
# Else it should be a pikepdf annotation object
|
641 |
else:
|
642 |
+
pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
|
643 |
+
|
644 |
+
x1 = pymupdf_x1
|
645 |
+
x2 = pymupdf_x2
|
646 |
|
647 |
rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
|
648 |
|
|
|
669 |
|
670 |
all_image_annotation_boxes.append(img_annotation_box)
|
671 |
|
672 |
+
# Calculate the middle y value and set a small height (not used)
|
673 |
+
#print("Rect:", rect)
|
674 |
middle_y = (pymupdf_y1 + pymupdf_y2) / 2
|
675 |
+
rect_small_pixel_height = Rect(pymupdf_x1, middle_y - 2, pymupdf_x2, middle_y + 2) # Small height in middle of line
|
676 |
|
677 |
# Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
|
678 |
+
#page.add_redact_annot(rect)#rect_small_pixel_height)
|
679 |
+
page.add_redact_annot(rect_small_pixel_height)
|
680 |
|
681 |
# Set up drawing a black box over the whole rect
|
682 |
shape = page.new_shape()
|
683 |
shape.draw_rect(rect)
|
684 |
+
|
685 |
+
def convert_color_to_range_0_1(color):
|
686 |
+
return tuple(component / 255 for component in color)
|
687 |
+
|
688 |
+
if img_annotation_box["color"][0] > 1:
|
689 |
+
out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
|
690 |
+
else:
|
691 |
+
out_colour = img_annotation_box["color"]
|
692 |
+
|
693 |
+
shape.finish(color=out_colour, fill=out_colour) # Black fill for the rectangle
|
694 |
+
#shape.finish(color=(0, 0, 0)) # Black fill for the rectangle
|
695 |
shape.commit()
|
696 |
|
697 |
out_annotation_boxes = {
|
tools/redaction_review.py
CHANGED
@@ -164,16 +164,16 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
|
|
164 |
|
165 |
# If working with image docs
|
166 |
if is_pdf(file_path) == False:
|
167 |
-
|
168 |
|
169 |
-
image =
|
170 |
|
171 |
# try:
|
172 |
# image = Image.open(image_annotated['image'])
|
173 |
# except:
|
174 |
# image = Image.fromarray(image_annotated['image'].astype('uint8'))
|
175 |
|
176 |
-
draw = ImageDraw.Draw(
|
177 |
|
178 |
for img_annotation_box in image_annotated['boxes']:
|
179 |
coords = [img_annotation_box["xmin"],
|
@@ -191,9 +191,9 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
|
|
191 |
|
192 |
# If working with pdfs
|
193 |
else:
|
194 |
-
|
195 |
|
196 |
-
number_of_pages =
|
197 |
|
198 |
print("Saving pages to file.")
|
199 |
|
@@ -216,12 +216,12 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
|
|
216 |
elif isinstance(image_loc, str):
|
217 |
image = Image.open(image_loc)
|
218 |
|
219 |
-
pymupdf_page =
|
220 |
pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
|
221 |
|
222 |
#try:
|
223 |
out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
|
224 |
-
|
225 |
output_files.append(out_pdf_file_path)
|
226 |
|
227 |
# Save the gradio_annotation_boxes to a JSON file
|
|
|
164 |
|
165 |
# If working with image docs
|
166 |
if is_pdf(file_path) == False:
|
167 |
+
pdf_doc = Image.open(file_paths[-1])
|
168 |
|
169 |
+
image = pdf_doc
|
170 |
|
171 |
# try:
|
172 |
# image = Image.open(image_annotated['image'])
|
173 |
# except:
|
174 |
# image = Image.fromarray(image_annotated['image'].astype('uint8'))
|
175 |
|
176 |
+
draw = ImageDraw.Draw(pdf_doc)
|
177 |
|
178 |
for img_annotation_box in image_annotated['boxes']:
|
179 |
coords = [img_annotation_box["xmin"],
|
|
|
191 |
|
192 |
# If working with pdfs
|
193 |
else:
|
194 |
+
pdf_doc = pymupdf.open(file_path)
|
195 |
|
196 |
+
number_of_pages = pdf_doc.page_count
|
197 |
|
198 |
print("Saving pages to file.")
|
199 |
|
|
|
216 |
elif isinstance(image_loc, str):
|
217 |
image = Image.open(image_loc)
|
218 |
|
219 |
+
pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
|
220 |
pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
|
221 |
|
222 |
#try:
|
223 |
out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
|
224 |
+
pdf_doc.save(out_pdf_file_path)
|
225 |
output_files.append(out_pdf_file_path)
|
226 |
|
227 |
# Save the gradio_annotation_boxes to a JSON file
|