Sean Pedrick-Case committed on
Commit
e06b754
·
unverified ·
2 Parent(s): 176c4ad 20b655f

Merge pull request #29 from seanpedrick-case/dev

Browse files

Added more checks for documents with differing media and cropbox sizes

.dockerignore CHANGED
@@ -19,4 +19,5 @@ logs/*
19
  config/*
20
  user_guide/*
21
  cdk/*
 
22
  web/*
 
19
  config/*
20
  user_guide/*
21
  cdk/*
22
+ cdk/config/*
23
  web/*
.gitignore CHANGED
@@ -20,4 +20,5 @@ config/*
20
  doc_redaction_amplify_app/*
21
  user_guide/*
22
  cdk/*
 
23
  web/*
 
20
  doc_redaction_amplify_app/*
21
  user_guide/*
22
  cdk/*
23
+ cdk/config/*
24
  web/*
DocRedactApp.spec CHANGED
@@ -43,7 +43,7 @@ exe = EXE(
43
  a.scripts,
44
  [],
45
  exclude_binaries=True,
46
- name='DocRedactApp_0.6.2',
47
  debug=False,
48
  bootloader_ignore_signals=False,
49
  strip=False,
@@ -62,5 +62,5 @@ coll = COLLECT(
62
  strip=False,
63
  upx=True,
64
  upx_exclude=[],
65
- name='DocRedactApp_0.6.2',
66
  )
 
43
  a.scripts,
44
  [],
45
  exclude_binaries=True,
46
+ name='DocRedactApp',
47
  debug=False,
48
  bootloader_ignore_signals=False,
49
  strip=False,
 
62
  strip=False,
63
  upx=True,
64
  upx_exclude=[],
65
+ name='DocRedactApp',
66
  )
README.md CHANGED
@@ -10,7 +10,7 @@ license: agpl-3.0
10
  ---
11
  # Document redaction
12
 
13
- version: 0.6.5
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
 
10
  ---
11
  # Document redaction
12
 
13
+ version: 0.6.6
14
 
15
  Redact personally identifiable information (PII) from documents (pdf, images), open text, or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a walkthrough on how to use the app. Below is a very brief overview.
16
 
pyproject.toml CHANGED
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
 
5
  [project]
6
  name = "doc_redaction"
7
- version = "0.6.5"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
@@ -23,7 +23,7 @@ dependencies = [
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
- "gradio==5.29.0",
27
  "boto3==1.38.4",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
 
4
 
5
  [project]
6
  name = "doc_redaction"
7
+ version = "0.6.6"
8
  description = "Redact PDF/image-based documents, or CSV/XLSX files using a Gradio-based GUI interface"
9
  readme = "README.md"
10
  requires-python = ">=3.10"
 
23
  "spacy==3.8.4",
24
  # Direct URL dependency for spacy model
25
  "en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz",
26
+ "gradio==5.29.1",
27
  "boto3==1.38.4",
28
  "pyarrow==19.0.1",
29
  "openpyxl==3.1.5",
requirements.txt CHANGED
@@ -10,7 +10,7 @@ pandas==2.2.3
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
- gradio==5.29.0
14
  boto3==1.38.4
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
 
10
  scikit-learn==1.6.1
11
  spacy==3.8.4
12
  en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
13
+ gradio==5.29.1
14
  boto3==1.38.4
15
  pyarrow==19.0.1
16
  openpyxl==3.1.5
tools/config.py CHANGED
@@ -212,8 +212,8 @@ if LOGGING == 'True':
212
  ###
213
 
214
  # Create Tesseract and Poppler folders if you have installed them locally
215
- TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # e.g. tesseract/
216
- POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # e.g. poppler/poppler-24.02.0/Library/bin/
217
 
218
  if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
219
  if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
 
212
  ###
213
 
214
  # Create Tesseract and Poppler folders if you have installed them locally
215
+ TESSERACT_FOLDER = get_or_create_env_var('TESSERACT_FOLDER', "") # If installing for Windows, install Tesseract 5.5.0 from here: https://github.com/UB-Mannheim/tesseract/wiki. Then this environment variable should point to the Tesseract folder e.g. tesseract/
216
+ POPPLER_FOLDER = get_or_create_env_var('POPPLER_FOLDER', "") # If installing on Windows, install Poppler from here https://github.com/oschwartz10612/poppler-windows. This variable needs to point to the poppler bin folder e.g. poppler/poppler-24.02.0/Library/bin/
217
 
218
  if TESSERACT_FOLDER: add_folder_to_path(TESSERACT_FOLDER)
219
  if POPPLER_FOLDER: add_folder_to_path(POPPLER_FOLDER)
tools/file_redaction.py CHANGED
@@ -8,7 +8,7 @@ import copy
8
 
9
  from tqdm import tqdm
10
  from PIL import Image, ImageChops, ImageFile, ImageDraw
11
- from typing import List, Dict, Tuple
12
  import pandas as pd
13
 
14
  from pdfminer.high_level import extract_pages
@@ -932,22 +932,70 @@ def convert_pikepdf_annotations_to_result_annotation_box(page:Page, annot:dict,
932
 
933
  return img_annotation_box, rect
934
 
935
- def set_cropbox_safely(page, original_cropbox):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
936
  """
937
- Sets the cropbox of a page, ensuring it's not larger than the mediabox.
938
- If the original cropbox is larger, the mediabox is used instead.
 
 
 
 
939
 
940
  Args:
941
- page: The PyMuPdf page object.
942
- original_cropbox: The fitz.Rect representing the desired cropbox.
943
  """
944
  mediabox = page.mediabox
945
- if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
946
- #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
947
  page.set_cropbox(mediabox)
948
  else:
949
  page.set_cropbox(original_cropbox)
950
 
 
951
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
952
 
953
  rect_height = page.rect.height
 
8
 
9
  from tqdm import tqdm
10
  from PIL import Image, ImageChops, ImageFile, ImageDraw
11
+ from typing import List, Dict, Tuple, Optional
12
  import pandas as pd
13
 
14
  from pdfminer.high_level import extract_pages
 
932
 
933
  return img_annotation_box, rect
934
 
935
+ # def set_cropbox_safely(page, original_cropbox):
936
+ # """
937
+ # Sets the cropbox of a page, ensuring it's not larger than the mediabox.
938
+ # If the original cropbox is larger, the mediabox is used instead.
939
+
940
+ # Args:
941
+ # page: The PyMuPdf page object.
942
+ # original_cropbox: The fitz.Rect representing the desired cropbox.
943
+ # """
944
+ # mediabox = page.mediabox
945
+ # if original_cropbox.width > mediabox.width or original_cropbox.height > mediabox.height:
946
+ # #print("Warning: Requested cropbox is larger than the mediabox. Using mediabox instead.")
947
+ # page.set_cropbox(mediabox)
948
+ # else:
949
+ # page.set_cropbox(original_cropbox)
950
+
951
+
952
def set_cropbox_safely(page: Page, original_cropbox: Optional[Rect]):
    """
    Set the cropbox of a PyMuPDF page, falling back to the mediabox when needed.

    The requested cropbox is applied only if it is a Rect instance that, once
    normalised, is not empty, not infinite, and is contained within the page's
    mediabox. In every other case the page's mediabox is set as the cropbox and
    a warning explaining the reason is printed.

    The caller's Rect is never modified: normalisation is performed on a copy.

    Args:
        page: The PyMuPDF page object whose cropbox is to be set.
        original_cropbox: The Rect representing the desired cropbox, or None.
    """
    mediabox = page.mediabox
    cropbox = None
    reason_for_defaulting = ""

    if original_cropbox is None:
        reason_for_defaulting = "the original cropbox is None."
    elif not isinstance(original_cropbox, Rect):
        reason_for_defaulting = f"the original cropbox is not a fitz.Rect instance (got {type(original_cropbox)})."
    else:
        # Rect.normalize() works in place, so normalise a copy rather than
        # silently rewriting the rectangle that the caller still holds.
        cropbox = Rect(original_cropbox)
        # After this x0 <= x1 and y0 <= y1; a zero-width or zero-height
        # rectangle is still possible and is caught by the empty check below.
        cropbox.normalize()

        if cropbox.is_empty:
            reason_for_defaulting = f"the provided original cropbox {cropbox} is empty."
        elif cropbox.is_infinite:
            reason_for_defaulting = f"the provided original cropbox {cropbox} is infinite."
        elif not mediabox.contains(cropbox):
            reason_for_defaulting = (
                f"the provided original cropbox {cropbox} is not fully contained "
                f"within the page's mediabox {mediabox}."
            )

    if reason_for_defaulting:
        print(
            f"Warning (Page {page.number}): Cannot use original cropbox because {reason_for_defaulting} "
            f"Defaulting to the page's mediabox as the cropbox."
        )
        page.set_cropbox(mediabox)
    else:
        page.set_cropbox(cropbox)
997
 
998
+
999
  def redact_page_with_pymupdf(page:Page, page_annotations:dict, image:Image=None, custom_colours:bool=False, redact_whole_page:bool=False, convert_pikepdf_to_pymupdf_coords:bool=True, original_cropbox:List[Rect]=[], page_sizes_df:pd.DataFrame=pd.DataFrame()):
1000
 
1001
  rect_height = page.rect.height
tools/textract_batch_call.py CHANGED
@@ -338,19 +338,15 @@ def load_pdf_job_file_from_s3(
338
  RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
339
 
340
  try:
341
- print("load_s3_jobs_input_loc:", load_s3_jobs_input_loc)
342
  pdf_file_location = ''
343
  doc_file_name_no_extension_textbox = ''
344
 
345
  s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
346
  s3_input_key_prefix = s3_input_key_prefix + ".pdf"
347
- print("s3_input_key_prefix:", s3_input_key_prefix)
348
-
349
  local_input_file_path = os.path.join(local_output_dir, pdf_filename)
350
  local_input_file_path = local_input_file_path + ".pdf"
351
 
352
- print("input to s3 download:", s3_bucket_name, s3_input_key_prefix, local_input_file_path)
353
-
354
  download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
355
 
356
  pdf_file_location = [local_input_file_path]
 
338
  RUN_AWS_FUNCTIONS=RUN_AWS_FUNCTIONS):
339
 
340
  try:
 
341
  pdf_file_location = ''
342
  doc_file_name_no_extension_textbox = ''
343
 
344
  s3_input_key_prefix = os.path.join(load_s3_jobs_input_loc, pdf_filename).replace("\\", "/")
345
  s3_input_key_prefix = s3_input_key_prefix + ".pdf"
346
+
 
347
  local_input_file_path = os.path.join(local_output_dir, pdf_filename)
348
  local_input_file_path = local_input_file_path + ".pdf"
349
 
 
 
350
  download_file_from_s3(s3_bucket_name, s3_input_key_prefix, local_input_file_path, RUN_AWS_FUNCTIONS= RUN_AWS_FUNCTIONS)
351
 
352
  pdf_file_location = [local_input_file_path]