Spaces:

seanpedrickcase
/

document_redaction

Running

@@ -43,7 +43,7 @@ exe = EXE(
     a.scripts,
     [],
     exclude_binaries=True,
-    name='DocRedactApp_0.6.2',
     debug=False,
     bootloader_ignore_signals=False,
     strip=False,
@@ -62,5 +62,5 @@ coll = COLLECT(
     strip=False,
     upx=True,
     upx_exclude=[],
-    name='DocRedactApp_0.6.2',
 )

     a.scripts,
     [],
     exclude_binaries=True,
+    name='DocRedactApp',
     debug=False,
     bootloader_ignore_signals=False,
     strip=False,
     strip=False,
     upx=True,
     upx_exclude=[],
+    name='DocRedactApp',
 )

README.md CHANGED Viewed

@@ -10,7 +10,7 @@ license: agpl-3.0
 ---
 # Document redaction
-version: 0.6.5
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

 ---
 # Document redaction
+version: 0.6.6
 Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.

pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "doc_redaction"
-version = "0.6.5"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
     "spacy==3.8.4",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
-    "gradio==5.29.0",
     "boto3==1.38.4",
     "pyarrow==19.0.1",
     "openpyxl==3.1.5",

 [project]
 name = "doc_redaction"
+version = "0.6.6"
 description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
     "spacy==3.8.4",
     # Direct URL dependency for spacy model
     "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
+    "gradio==5.29.1",
     "boto3==1.38.4",
     "pyarrow==19.0.1",
     "openpyxl==3.1.5",

requirements.txt CHANGED Viewed

@@ -10,7 +10,7 @@ pandas==2.2.3
 scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
-gradio==5.29.0
 boto3==1.38.4
 pyarrow==19.0.1
 openpyxl==3.1.5

 scikit-learn==1.6.1
 spacy==3.8.4
 en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==5.29.1
 boto3==1.38.4
 pyarrow==19.0.1
 openpyxl==3.1.5

tools/config.py CHANGED Viewed

@@ -212,8 +212,8 @@ if LOGGING == 'True':
 ###
 # Create Tesseract and Poppler folders if you have installed them locally
-TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
-POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
 if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
 if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)

 ###
 # Create Tesseract and Poppler folders if you have installed them locally
+TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # If installing on Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
+POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows, install Poppler from here: https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/
 if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
 if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)

tools/file_redaction.py CHANGED Viewed

@@ -8,7 +8,7 @@ import copy
 from tqdm import tqdm
 from PIL import Image, ImageChops, ImageFile, ImageDraw
-from typing import List, Dict, Tuple
 import pandas as pd
 from pdfminer.high_level import extract_pages
@@ -932,22 +932,70 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
     return img_annotation_box, rect
-def set_cropbox_safely(page, original_cropbox):
     """
-    Sets the cropbox of a page, ensuring it's not larger than the mediabox.
-    If the original cropbox is larger, the mediabox is used instead.
     Args:
-        page: The PyMuPdf page object.
-        original_cropbox: The fitz.Rect representing the desired cropbox.
     """
     mediabox = page.mediabox
-    if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
-        #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
         page.set_cropbox(mediabox)
     else:
         page.set_cropbox(original_cropbox)
 def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
     rect_height = page.rect.height

 from tqdm import tqdm
 from PIL import Image, ImageChops, ImageFile, ImageDraw
+from typing import List, Dict, Tuple, Optional
 import pandas as pd
 from pdfminer.high_level import extract_pages
     return img_annotation_box, rect
+# def set_cropbox_safely(page, original_cropbox):
+#     """
+#     Sets the cropbox of a page, ensuring it's not larger than the mediabox.
+#     If the original cropbox is larger, the mediabox is used instead.
+#     Args:
+#         page: The PyMuPdf page object.
+#         original_cropbox: The fitz.Rect representing the desired cropbox.
+#     """
+#     mediabox = page.mediabox
+#     if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
+#         #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
+#         page.set_cropbox(mediabox)
+#     else:
+#         page.set_cropbox(original_cropbox)
def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
    """
    Set the cropbox of a PyMuPDF page, falling back to the mediabox when needed.

    The requested cropbox is applied only when it is a Rect that, once
    normalised, is non-empty, finite and fully contained within the page's
    mediabox. In every other case the page's mediabox is used as the cropbox
    and a warning explaining the reason is printed.

    The rectangle passed in is never modified: normalisation and validation
    are carried out on a copy, so the caller's object keeps its coordinates.

    Args:
        page: The PyMuPDF page object.
        original_cropbox: The Rect representing the desired cropbox, or None.
    """
    mediabox = page.mediabox
    cropbox = None
    reason_for_defaulting = ""

    # Check for None
    if original_cropbox is None:
        reason_for_defaulting = "the original cropbox is None."
    # Check for incorrect type
    elif not isinstance(original_cropbox, Rect):
        reason_for_defaulting = f"the original cropbox is not a fitz.Rect instance (got {type(original_cropbox)})."
    else:
        # normalize() works in place, so normalise a copy rather than the
        # caller's rectangle (ensures x0 < x1 and y0 < y1 on the copy).
        cropbox = Rect(original_cropbox)
        cropbox.normalize()

        # Check for empty or infinite or out-of-bounds
        if cropbox.is_empty:
            reason_for_defaulting = f"the provided original cropbox {cropbox} is empty."
        elif cropbox.is_infinite:
            reason_for_defaulting = f"the provided original cropbox {cropbox} is infinite."
        elif not mediabox.contains(cropbox):
            reason_for_defaulting = (
                f"the provided original cropbox {cropbox} is not fully contained "
                f"within the page's mediabox {mediabox}."
            )

    if reason_for_defaulting:
        print(
            f"Warning (Page {page.number}): Cannot use original cropbox because {reason_for_defaulting} "
            f"Defaulting to the page's mediabox as the cropbox."
        )
        page.set_cropbox(mediabox)
    else:
        page.set_cropbox(cropbox)
 def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
     rect_height = page.rect.height

tools/textract_batch_call.py CHANGED Viewed

@@ -338,19 +338,15 @@ def load_pdf_job_file_from_s3(
     RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
     try:
-        print("load_s3_jobs_input_loc:", load_s3_jobs_input_loc)
         pdf_file_location = ''
         doc_file_name_no_extension_textbox = ''
         s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
         s3_input_key_prefix = s3_input_key_prefix + ".pdf"
-        print("s3_input_key_prefix:", s3_input_key_prefix)
         local_input_file_path = os.path.join(local_output_dir, pdf_filename)
         local_input_file_path = local_input_file_path + ".pdf"
-        print("input to s3 download:", s3_bucket_name, s3_input_key_prefix, local_input_file_path)
         download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
         pdf_file_location = [local_input_file_path]

     RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
     try:
         pdf_file_location = ''
         doc_file_name_no_extension_textbox = ''
         s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
         s3_input_key_prefix = s3_input_key_prefix + ".pdf"
         local_input_file_path = os.path.join(local_output_dir, pdf_filename)
         local_input_file_path = local_input_file_path + ".pdf"
         download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
         pdf_file_location = [local_input_file_path]