Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Files Community

seanpedrickcase committed on Dec 19, 2024

Commit

23f8ca3

1 Parent(s): f0c28d7

Fixed an issue where redactions were sometimes not removing the text underneath boxes. You can now redact in different colours from the review page.

Browse files

Files changed (3) hide show

tools/aws_functions.py +1 -1
tools/file_redaction.py +31 -7
tools/redaction_review.py +7 -7

tools/aws_functions.py CHANGED Viewed

@@ -36,7 +36,7 @@ if RUN_AWS_FUNCTIONS == "1":
         bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
         session = boto3.Session()
-        print("session:", session)
     except Exception as e:
         print("Could not start boto3 session:", e)

         bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
         session = boto3.Session()
+        #print("session:", session)
     except Exception as e:
         print("Could not start boto3 session:", e)

tools/file_redaction.py CHANGED Viewed

@@ -585,6 +585,9 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
     rect_height = page.rect.height
     rect_width = page.rect.width
     out_annotation_boxes = {}
     all_image_annotation_boxes = []
     image_path = ""
@@ -610,11 +613,17 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
             if isinstance(annot, dict):
                 img_annotation_box = annot
-                x1, pymupdf_y1, x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
             # Else should be CustomImageRecognizerResult
             else:
-                x1, pymupdf_y1, x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
                 img_annotation_box["xmin"] = annot.left
                 img_annotation_box["ymin"] = annot.top
@@ -630,7 +639,10 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
         # Else it should be a pikepdf annotation object
         else:
-            x1, pymupdf_y1, x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
             rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
@@ -657,17 +669,29 @@ def redact_page_with_pymupdf(page:Page, annotations_on_page, image = None):
         all_image_annotation_boxes.append(img_annotation_box)
-        # Calculate the middle y value and set height to 1 pixel
         middle_y = (pymupdf_y1 + pymupdf_y2) / 2
-        rect_single_pixel_height = Rect(x1, middle_y - 2, x2, middle_y + 2)  # Small height in middle of word to remove text
         # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
-        page.add_redact_annot(rect_single_pixel_height)
         # Set up drawing a black box over the whole rect
         shape = page.new_shape()
         shape.draw_rect(rect)
-        shape.finish(color=(0, 0, 0), fill=(0, 0, 0))  # Black fill for the rectangle
         shape.commit()
     out_annotation_boxes = {

     rect_height = page.rect.height
     rect_width = page.rect.width
+    pymupdf_x1 = None
+    pymupdf_x2 = None
     out_annotation_boxes = {}
     all_image_annotation_boxes = []
     image_path = ""
             if isinstance(annot, dict):
                 img_annotation_box = annot
+                pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_gradio_annotation_coords_to_pymupdf(page, annot, image)
+                x1 = pymupdf_x1
+                x2 = pymupdf_x2
             # Else should be CustomImageRecognizerResult
             else:
+                pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_image_coords_to_pymupdf(page, annot, image)
+                x1 = pymupdf_x1
+                x2 = pymupdf_x2
                 img_annotation_box["xmin"] = annot.left
                 img_annotation_box["ymin"] = annot.top
         # Else it should be a pikepdf annotation object
         else:
+            pymupdf_x1, pymupdf_y1, pymupdf_x2, pymupdf_y2 = convert_pikepdf_coords_to_pymupdf(page, annot)
+            x1 = pymupdf_x1
+            x2 = pymupdf_x2
             rect = Rect(x1, pymupdf_y1, x2, pymupdf_y2)
         all_image_annotation_boxes.append(img_annotation_box)
+        # Calculate the middle y value and build a thin rect around it (used for the redaction annotation below)
+        #print("Rect:", rect)
         middle_y = (pymupdf_y1 + pymupdf_y2) / 2
+        rect_small_pixel_height = Rect(pymupdf_x1, middle_y - 2, pymupdf_x2, middle_y + 2)  # Small height in middle of line
         # Add the annotation to the middle of the character line, so that it doesn't delete text from adjacent lines
+        #page.add_redact_annot(rect)#rect_small_pixel_height)
+        page.add_redact_annot(rect_small_pixel_height)
         # Set up drawing a black box over the whole rect
         shape = page.new_shape()
         shape.draw_rect(rect)
+        def convert_color_to_range_0_1(color):
+            return tuple(component / 255 for component in color)
+        if img_annotation_box["color"][0] > 1:
+            out_colour = convert_color_to_range_0_1(img_annotation_box["color"])
+        else:
+            out_colour = img_annotation_box["color"]
+        shape.finish(color=out_colour, fill=out_colour)  # Black fill for the rectangle
+        #shape.finish(color=(0, 0, 0))  # Black fill for the rectangle
         shape.commit()
     out_annotation_boxes = {

tools/redaction_review.py CHANGED Viewed

@@ -164,16 +164,16 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
     # If working with image docs
     if is_pdf(file_path) == False:
-        unredacted_doc = Image.open(file_paths[-1])
-        image = unredacted_doc
         # try:
         #     image = Image.open(image_annotated['image'])
         # except:
         #     image = Image.fromarray(image_annotated['image'].astype('uint8'))
-        draw = ImageDraw.Draw(unredacted_doc)
         for img_annotation_box in image_annotated['boxes']:
             coords = [img_annotation_box["xmin"],
@@ -191,9 +191,9 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
     # If working with pdfs
     else:
-        unredacted_doc = pymupdf.open(file_path)
-        number_of_pages = unredacted_doc.page_count
         print("Saving pages to file.")
@@ -216,12 +216,12 @@ def apply_redactions(image_annotated:AnnotatedImageData, file_paths:str, doc:Doc
             elif isinstance(image_loc, str):
                 image = Image.open(image_loc)
-            pymupdf_page = unredacted_doc.load_page(i) #doc.load_page(current_page -1)
             pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
     #try:
     out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
-    unredacted_doc.save(out_pdf_file_path)
     output_files.append(out_pdf_file_path)
     # Save the gradio_annotation_boxes to a JSON file

     # If working with image docs
     if is_pdf(file_path) == False:
+        pdf_doc = Image.open(file_paths[-1])
+        image = pdf_doc
         # try:
         #     image = Image.open(image_annotated['image'])
         # except:
         #     image = Image.fromarray(image_annotated['image'].astype('uint8'))
+        draw = ImageDraw.Draw(pdf_doc)
         for img_annotation_box in image_annotated['boxes']:
             coords = [img_annotation_box["xmin"],
     # If working with pdfs
     else:
+        pdf_doc = pymupdf.open(file_path)
+        number_of_pages = pdf_doc.page_count
         print("Saving pages to file.")
             elif isinstance(image_loc, str):
                 image = Image.open(image_loc)
+            pymupdf_page = pdf_doc.load_page(i) #doc.load_page(current_page -1)
             pymupdf_page = redact_page_with_pymupdf(pymupdf_page, all_image_annotations[i], image)
     #try:
     out_pdf_file_path = output_folder + file_base + "_redacted.pdf"
+    pdf_doc.save(out_pdf_file_path)
     output_files.append(out_pdf_file_path)
     # Save the gradio_annotation_boxes to a JSON file