Spaces:

strickvl
/

redaction-detector

Runtime error

App Files Files Community

Alex Strick van Linschoten committed on May 6, 2022

Commit

f4f594a

1 Parent(s): c6ed7c3

add area calculations and delete model

Browse files

Files changed (2) hide show

2022-01-15-vfnet-post-self-train.pth +0 -3
app.py +46 -4

2022-01-15-vfnet-post-self-train.pth DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8db6b7adeef1d66f4e8684bdca6fb9fb4720ad149e4994e10a5af3e26bfc2507
-size 131183383

app.py CHANGED Viewed

@@ -13,7 +13,6 @@ from icevision.all import *
 from icevision.models.checkpoint import *
 from PIL import Image as PILImage
-# checkpoint_path = "./2022-01-15-vfnet-post-self-train.pth"
 checkpoint_path = "./allsynthetic-imgsize768.pth"
 checkpoint_and_model = model_from_checkpoint(checkpoint_path)
 model = checkpoint_and_model["model"]
@@ -33,11 +32,38 @@ learn = load_learner(
 labels = learn.dls.vocab
 def predict(pdf, confidence, generate_file):
     filename_without_extension = pdf.name[:-4]
     document = fitz.open(pdf.name)
     results = []
     images = []
     tmp_dir = tempfile.gettempdir()
     for page_num, page in enumerate(document, start=1):
         image_pixmap = page.get_pixmap()
@@ -77,6 +103,9 @@ def predict(pdf, confidence, generate_file):
         tmp_dir, filename_without_extension, "redacted_pages.pdf"
     )
     if generate_file:
         pdf = FPDF()
         pdf.set_auto_page_break(0)
         imagelist = sorted(
@@ -109,7 +138,11 @@ def predict(pdf, confidence, generate_file):
                     font_size=16,
                     label_color="#FF59D6",
                 )
-                print(pred_dict)
                 pred_dict["img"].save(
                     os.path.join(
                         tmp_dir, filename_without_extension, f"pred-{image}"
@@ -123,10 +156,19 @@ def predict(pdf, confidence, generate_file):
             )
         pdf.output(report, "F")
-    text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}."
     if generate_file:
-        return text_output, images, report
     else:
         return text_output, images, None

 from icevision.models.checkpoint import *
 from PIL import Image as PILImage
 checkpoint_path = "./allsynthetic-imgsize768.pth"
 checkpoint_and_model = model_from_checkpoint(checkpoint_path)
 model = checkpoint_and_model["model"]
 labels = learn.dls.vocab
def get_content_area(pred_dict) -> float:
    """Return the area of the first bounding box labelled ``"content"``.

    Args:
        pred_dict: Mapping holding parallel ``"labels"`` and ``"bboxes"``
            sequences. Each bbox must expose ``xmin``, ``ymin``, ``xmax``
            and ``ymax`` attributes.

    Returns:
        ``(xmax - xmin) * (ymax - ymin)`` of the first box whose label is
        ``"content"``, or ``0`` when no label matches. Any further
        ``"content"`` boxes are ignored.
    """
    # Walk labels and boxes in lockstep and stop at the first match, rather
    # than testing membership and then indexing ``bboxes`` by position.
    first_content_bbox = next(
        (
            bbox
            for label, bbox in zip(pred_dict["labels"], pred_dict["bboxes"])
            if label == "content"
        ),
        None,
    )
    if first_content_bbox is None:
        return 0
    width = first_content_bbox.xmax - first_content_bbox.xmin
    height = first_content_bbox.ymax - first_content_bbox.ymin
    return width * height
def get_redaction_area(pred_dict) -> float:
    """Return the summed area of every bounding box labelled ``"redaction"``.

    Args:
        pred_dict: Mapping holding parallel ``"labels"`` and ``"bboxes"``
            sequences. Each bbox must expose ``xmin``, ``ymin``, ``xmax``
            and ``ymax`` attributes.

    Returns:
        The sum of ``(xmax - xmin) * (ymax - ymin)`` over all boxes whose
        label is ``"redaction"``; ``0`` when there are none.

    Note:
        Each box is added independently, so any region covered by more than
        one redaction box is counted once per box.
    """
    # An empty generator already sums to 0, so no separate membership check
    # is needed before iterating.
    return sum(
        (bbox.xmax - bbox.xmin) * (bbox.ymax - bbox.ymin)
        for label, bbox in zip(pred_dict["labels"], pred_dict["bboxes"])
        if label == "redaction"
    )
 def predict(pdf, confidence, generate_file):
     filename_without_extension = pdf.name[:-4]
     document = fitz.open(pdf.name)
     results = []
     images = []
+    total_redacted_image_areas = 0
     tmp_dir = tempfile.gettempdir()
     for page_num, page in enumerate(document, start=1):
         image_pixmap = page.get_pixmap()
         tmp_dir, filename_without_extension, "redacted_pages.pdf"
     )
     if generate_file:
+        total_image_areas = 0
+        total_content_areas = 0
+        total_redaction_area = 0
         pdf = FPDF()
         pdf.set_auto_page_break(0)
         imagelist = sorted(
                     font_size=16,
                     label_color="#FF59D6",
                 )
+                total_image_areas += pred_dict["width"] * pred_dict["height"]
+                total_content_areas += get_content_area(pred_dict)
+                total_redaction_area += get_redaction_area(pred_dict)
                 pred_dict["img"].save(
                     os.path.join(
                         tmp_dir, filename_without_extension, f"pred-{image}"
             )
         pdf.output(report, "F")
+    total_redaction_proportion = round(
+        (total_redaction_area / total_image_areas) * 100, 1
+    )
+    content_redaction_proportion = round(
+        (total_redaction_area / total_content_areas) * 100, 1
+    )
+    text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}. "
+    redaction_analysis = f"{total_redaction_proportion}% of the total area of the redacted pages was redacted. \n {content_redaction_proportion}% of the actual content of those redacted pages was redacted."
     if generate_file:
+        return text_output + redaction_analysis, images, report
     else:
         return text_output, images, None