Spaces:
Runtime error
Runtime error
Alex Strick van Linschoten
committed on
Commit
·
f4f594a
1
Parent(s):
c6ed7c3
add area calculations and delete model
Browse files- 2022-01-15-vfnet-post-self-train.pth +0 -3
- app.py +46 -4
2022-01-15-vfnet-post-self-train.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:8db6b7adeef1d66f4e8684bdca6fb9fb4720ad149e4994e10a5af3e26bfc2507
|
| 3 |
-
size 131183383
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -13,7 +13,6 @@ from icevision.all import *
|
|
| 13 |
from icevision.models.checkpoint import *
|
| 14 |
from PIL import Image as PILImage
|
| 15 |
|
| 16 |
-
# checkpoint_path = "./2022-01-15-vfnet-post-self-train.pth"
|
| 17 |
checkpoint_path = "./allsynthetic-imgsize768.pth"
|
| 18 |
checkpoint_and_model = model_from_checkpoint(checkpoint_path)
|
| 19 |
model = checkpoint_and_model["model"]
|
|
@@ -33,11 +32,38 @@ learn = load_learner(
|
|
| 33 |
labels = learn.dls.vocab
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def predict(pdf, confidence, generate_file):
|
| 37 |
filename_without_extension = pdf.name[:-4]
|
| 38 |
document = fitz.open(pdf.name)
|
| 39 |
results = []
|
| 40 |
images = []
|
|
|
|
| 41 |
tmp_dir = tempfile.gettempdir()
|
| 42 |
for page_num, page in enumerate(document, start=1):
|
| 43 |
image_pixmap = page.get_pixmap()
|
|
@@ -77,6 +103,9 @@ def predict(pdf, confidence, generate_file):
|
|
| 77 |
tmp_dir, filename_without_extension, "redacted_pages.pdf"
|
| 78 |
)
|
| 79 |
if generate_file:
|
|
|
|
|
|
|
|
|
|
| 80 |
pdf = FPDF()
|
| 81 |
pdf.set_auto_page_break(0)
|
| 82 |
imagelist = sorted(
|
|
@@ -109,7 +138,11 @@ def predict(pdf, confidence, generate_file):
|
|
| 109 |
font_size=16,
|
| 110 |
label_color="#FF59D6",
|
| 111 |
)
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
pred_dict["img"].save(
|
| 114 |
os.path.join(
|
| 115 |
tmp_dir, filename_without_extension, f"pred-{image}"
|
|
@@ -123,10 +156,19 @@ def predict(pdf, confidence, generate_file):
|
|
| 123 |
)
|
| 124 |
pdf.output(report, "F")
|
| 125 |
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
if generate_file:
|
| 129 |
-
return text_output, images, report
|
| 130 |
else:
|
| 131 |
return text_output, images, None
|
| 132 |
|
|
|
|
| 13 |
from icevision.models.checkpoint import *
|
| 14 |
from PIL import Image as PILImage
|
| 15 |
|
|
|
|
| 16 |
checkpoint_path = "./allsynthetic-imgsize768.pth"
|
| 17 |
checkpoint_and_model = model_from_checkpoint(checkpoint_path)
|
| 18 |
model = checkpoint_and_model["model"]
|
|
|
|
| 32 |
labels = learn.dls.vocab
|
| 33 |
|
| 34 |
|
| 35 |
+
def get_content_area(pred_dict) -> int:
    """Return the area of the first bounding box labelled "content".

    ``pred_dict`` is read through the keys ``"labels"`` and ``"bboxes"``;
    the two sequences are treated as parallel, i.e. the i-th label describes
    the i-th box. Each box must expose ``xmin``, ``xmax``, ``ymin`` and
    ``ymax`` attributes.

    Only the first "content" box is measured; any further "content" boxes
    are ignored. Returns 0 when no box carries the "content" label.
    """
    # One pass over the paired labels/boxes replaces the earlier membership
    # scan followed by an index-based filter over the same list.
    for label, bbox in zip(pred_dict["labels"], pred_dict["bboxes"]):
        if label == "content":
            return (bbox.xmax - bbox.xmin) * (bbox.ymax - bbox.ymin)
    return 0
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def get_redaction_area(pred_dict) -> int:
    """Return the combined area of every bounding box labelled "redaction".

    ``pred_dict`` is read through the keys ``"labels"`` and ``"bboxes"``;
    the two sequences are treated as parallel, i.e. the i-th label describes
    the i-th box. Each box must expose ``xmin``, ``xmax``, ``ymin`` and
    ``ymax`` attributes.

    Box areas are simply added together, so overlapping boxes are each
    counted in full. Returns 0 when no box carries the "redaction" label.
    """
    # ``sum`` over an empty generator is already 0, so no separate
    # "label is absent" pre-check is needed.
    return sum(
        (bbox.xmax - bbox.xmin) * (bbox.ymax - bbox.ymin)
        for label, bbox in zip(pred_dict["labels"], pred_dict["bboxes"])
        if label == "redaction"
    )
|
| 59 |
+
|
| 60 |
+
|
| 61 |
def predict(pdf, confidence, generate_file):
|
| 62 |
filename_without_extension = pdf.name[:-4]
|
| 63 |
document = fitz.open(pdf.name)
|
| 64 |
results = []
|
| 65 |
images = []
|
| 66 |
+
total_redacted_image_areas = 0
|
| 67 |
tmp_dir = tempfile.gettempdir()
|
| 68 |
for page_num, page in enumerate(document, start=1):
|
| 69 |
image_pixmap = page.get_pixmap()
|
|
|
|
| 103 |
tmp_dir, filename_without_extension, "redacted_pages.pdf"
|
| 104 |
)
|
| 105 |
if generate_file:
|
| 106 |
+
total_image_areas = 0
|
| 107 |
+
total_content_areas = 0
|
| 108 |
+
total_redaction_area = 0
|
| 109 |
pdf = FPDF()
|
| 110 |
pdf.set_auto_page_break(0)
|
| 111 |
imagelist = sorted(
|
|
|
|
| 138 |
font_size=16,
|
| 139 |
label_color="#FF59D6",
|
| 140 |
)
|
| 141 |
+
|
| 142 |
+
total_image_areas += pred_dict["width"] * pred_dict["height"]
|
| 143 |
+
total_content_areas += get_content_area(pred_dict)
|
| 144 |
+
total_redaction_area += get_redaction_area(pred_dict)
|
| 145 |
+
|
| 146 |
pred_dict["img"].save(
|
| 147 |
os.path.join(
|
| 148 |
tmp_dir, filename_without_extension, f"pred-{image}"
|
|
|
|
| 156 |
)
|
| 157 |
pdf.output(report, "F")
|
| 158 |
|
| 159 |
+
total_redaction_proportion = round(
|
| 160 |
+
(total_redaction_area / total_image_areas) * 100, 1
|
| 161 |
+
)
|
| 162 |
+
content_redaction_proportion = round(
|
| 163 |
+
(total_redaction_area / total_content_areas) * 100, 1
|
| 164 |
+
)
|
| 165 |
+
|
| 166 |
+
text_output = f"A total of {len(redacted_pages)} pages were redacted. \n\n The redacted page numbers were: {', '.join(redacted_pages)}. "
|
| 167 |
+
|
| 168 |
+
redaction_analysis = f"{total_redaction_proportion}% of the total area of the redacted pages was redacted. \n {content_redaction_proportion}% of the actual content of those redacted pages was redacted."
|
| 169 |
|
| 170 |
if generate_file:
|
| 171 |
+
return text_output + redaction_analysis, images, report
|
| 172 |
else:
|
| 173 |
return text_output, images, None
|
| 174 |
|